class MaskLayerImpl(Layer):

    expected_inputs = {
        'default': StructureTemplate('T', 'B', '...'),
        'mask': StructureTemplate('T', 'B', 1)
    }

    computes_no_input_deltas_for = ['mask']

    def setup(self, kwargs, in_shapes):
        outputs = OrderedDict()
        outputs['default'] = in_shapes['default']
        return outputs, OrderedDict(), OrderedDict()

    def forward_pass(self, buffers, training_pass=True):
        _h = self.handler
        flat_inp = flatten_time_and_features(buffers.inputs.default)
        flat_mask = flatten_time(buffers.inputs.mask)
        flat_out = flatten_time_and_features(buffers.outputs.default)
        _h.mult_mv(flat_inp, flat_mask, out=flat_out)

    def backward_pass(self, buffers):
        _h = self.handler
        flat_out_deltas = flatten_time_and_features(
            buffers.output_deltas.default)
        tmp = _h.allocate(flat_out_deltas.shape)
        flat_mask = flatten_time(buffers.inputs.mask)
        flat_in_deltas = flatten_time_and_features(
            buffers.input_deltas.default)
        _h.mult_mv(flat_out_deltas, flat_mask, tmp)
        _h.add_tt(tmp, flat_in_deltas, flat_in_deltas)
class HighwayLayerImpl(Layer):

    expected_inputs = {'H': StructureTemplate('T', 'B', '...'),
                       'T': StructureTemplate('T', 'B', '...'),
                       'x': StructureTemplate('T', 'B', '...')}

    def setup(self, kwargs, in_shapes):
        # 'H', 'T' and 'x' must have the same shape
        if in_shapes['H'] != in_shapes['T']:
            raise LayerValidationError(
                "{}: H and T must have the same shape but got {} and {}"
                .format(self.name, in_shapes['H'], in_shapes['T']))
        if in_shapes['H'] != in_shapes['x']:
            raise LayerValidationError(
                "{}: H and x must have the same shape but got {} and {}"
                .format(self.name, in_shapes['H'], in_shapes['x']))

        outputs = OrderedDict()
        outputs['default'] = BufferStructure(
            'T', 'B', *self.in_shapes['x'].feature_shape)
        return outputs, OrderedDict(), OrderedDict()

    def forward_pass(self, buffers, training_pass=True):
        # prepare
        _h = self.handler
        x = buffers.inputs.x
        H = buffers.inputs.H
        T = buffers.inputs.T
        y = buffers.outputs.default

        # y = T * (H - x) + x
        tmp = _h.zeros(x.shape)
        _h.subtract_tt(H, x, out=tmp)
        _h.mult_tt(T, tmp, out=tmp)
        _h.add_tt(tmp, x, out=y)

    def backward_pass(self, buffers):
        # prepare
        _h = self.handler
        x = buffers.inputs.x
        H = buffers.inputs.H
        T = buffers.inputs.T
        dx = buffers.input_deltas.x
        dH = buffers.input_deltas.H
        dT = buffers.input_deltas.T
        dy = buffers.output_deltas.default

        # dx += (1 - T) * dy;  dH += T * dy;  dT += (H - x) * dy
        tmp = _h.ones(dx.shape)
        _h.subtract_tt(tmp, T, out=tmp)
        _h.mult_add_tt(tmp, dy, out=dx)
        _h.mult_add_tt(T, dy, out=dH)
        _h.subtract_tt(H, x, out=tmp)
        _h.mult_add_tt(tmp, dy, out=dT)
class SquaredErrorLayerImpl(Layer):

    expected_inputs = {
        'default': StructureTemplate('T', 'B', '...'),
        'targets': StructureTemplate('T', 'B', '...')
    }
    expected_kwargs = {}

    computes_no_input_deltas_for = ['targets']
    takes_no_output_deltas_from = ['predictions']

    def setup(self, kwargs, in_shapes):
        # 'default' and 'targets' must have the same feature shape
        in_shape = in_shapes['default'].feature_shape
        tar_shape = in_shapes['targets'].feature_shape
        if in_shape != tar_shape:
            raise LayerValidationError(
                "{}: default and targets must have same feature shapes but "
                "got {} and {}".format(self.name, in_shape, tar_shape))

        outputs = OrderedDict()
        outputs['predictions'] = BufferStructure('T', 'B', *in_shape)
        outputs['loss'] = BufferStructure('T', 'B', *in_shape)

        internals = OrderedDict()
        internals['diff'] = BufferStructure('T', 'B', *in_shape)
        return outputs, OrderedDict(), internals

    def forward_pass(self, buffers, training_pass=True):
        # prepare
        _h = self.handler
        x = flatten_time_and_features(buffers.inputs.default)
        t = flatten_time_and_features(buffers.inputs.targets)
        diff = flatten_time_and_features(buffers.internals.diff)
        y = flatten_time_and_features(buffers.outputs.predictions)
        loss = flatten_time_and_features(buffers.outputs.loss)

        # calculate: loss = 0.5 * (x - t) ** 2
        _h.copy_to(x, y)
        _h.subtract_tt(x, t, out=diff)
        _h.mult_tt(diff, diff, out=loss)
        _h.mult_st(0.5, loss, out=loss)

    def backward_pass(self, buffers):
        # prepare
        _h = self.handler
        dloss = flatten_time_and_features(buffers.output_deltas.loss)
        diff = flatten_time_and_features(buffers.internals.diff)
        dx = flatten_time_and_features(buffers.input_deltas.default)

        # calculate: dx += dloss * diff
        _h.mult_add_tt(dloss, diff, dx)
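# Reference sketch (not part of the layer API): a NumPy check that the math
# above is consistent, i.e. loss = 0.5 * (y - t)**2 and d loss / d y = y - t,
# which is exactly the `diff` internal reused in backward_pass. Array shapes
# (T, B, F) mirror the ('T', 'B', '...') templates; names are illustrative.
import numpy as np

def _squared_error_check(seed=0):
    rng = np.random.RandomState(seed)
    y = rng.randn(4, 3, 5)          # predictions, shape (T, B, F)
    t = rng.randn(4, 3, 5)          # targets, same shape
    diff = y - t
    loss = 0.5 * diff ** 2          # matches forward_pass
    # finite-difference check of the analytic gradient `diff`
    eps = 1e-6
    numeric = (0.5 * (y + eps - t) ** 2
               - 0.5 * (y - eps - t) ** 2) / (2 * eps)
    assert np.allclose(numeric, diff, atol=1e-5)
    return loss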
class MaskLayerImpl(Layer):

    expected_inputs = {'default': StructureTemplate('T', 'B', '...'),
                       'mask': StructureTemplate('T', 'B', '...')}

    computes_no_input_deltas_for = ['mask']

    def setup(self, kwargs, in_shapes):
        in_shape = in_shapes['default'].feature_shape
        expected_shape = in_shape[:-1] + (1,)

        if in_shapes['mask'].feature_shape == (1,):
            self.flatten_dim = 2
        elif in_shapes['mask'].feature_shape in [expected_shape, in_shape]:
            self.flatten_dim = len(in_shape) + 1
        else:
            raise LayerValidationError(
                "Shape of the mask did not match shape of the default "
                "inputs. Should be either ('T', 'B', 1) or {} or {}, but "
                "was {}".format(('T', 'B') + expected_shape,
                                in_shapes['default'].shape,
                                in_shapes['mask']))

        outputs = OrderedDict()
        outputs['default'] = in_shapes['default']
        return outputs, OrderedDict(), OrderedDict()

    def flatten_buffer(self, buffer):
        pre = buffer.shape[:self.flatten_dim]
        post = buffer.shape[self.flatten_dim:]
        return buffer.reshape((int(product(pre)), int(product(post))))

    def forward_pass(self, buffers, training_pass=True):
        _h = self.handler
        flat_inp = self.flatten_buffer(buffers.inputs.default)
        flat_mask = self.flatten_buffer(buffers.inputs.mask)
        flat_out = self.flatten_buffer(buffers.outputs.default)
        _h.mult_mv(flat_inp, flat_mask, out=flat_out)

    def backward_pass(self, buffers):
        _h = self.handler
        flat_out_deltas = self.flatten_buffer(buffers.output_deltas.default)
        tmp = _h.allocate(flat_out_deltas.shape)
        flat_mask = self.flatten_buffer(buffers.inputs.mask)
        flat_in_deltas = self.flatten_buffer(buffers.input_deltas.default)
        _h.mult_mv(flat_out_deltas, flat_mask, tmp)
        _h.add_tt(tmp, flat_in_deltas, flat_in_deltas)
class SquaredDifferenceLayerImpl(Layer):

    expected_inputs = {'inputs_1': StructureTemplate('T', 'B', '...'),
                       'inputs_2': StructureTemplate('T', 'B', '...')}
    expected_kwargs = {}

    def setup(self, kwargs, in_shapes):
        # 'inputs_1' and 'inputs_2' must have the same feature shape
        f_shape1 = in_shapes['inputs_1'].feature_shape
        f_shape2 = in_shapes['inputs_2'].feature_shape
        if f_shape1 != f_shape2:
            raise LayerValidationError(
                "{}: inputs_1 and inputs_2 must have same feature shapes but "
                "got {} and {}".format(self.name, f_shape1, f_shape2))

        outputs = OrderedDict()
        outputs['default'] = BufferStructure('T', 'B', *f_shape1)

        internals = OrderedDict()
        feature_shape = self.in_shapes['inputs_1'].feature_shape
        internals['diff'] = BufferStructure('T', 'B', *feature_shape)
        return outputs, OrderedDict(), internals

    def forward_pass(self, buffers, training_pass=True):
        # prepare
        _h = self.handler
        inputs_1 = flatten_time_and_features(buffers.inputs.inputs_1)
        inputs_2 = flatten_time_and_features(buffers.inputs.inputs_2)
        diff = flatten_time_and_features(buffers.internals.diff)
        outputs = flatten_time_and_features(buffers.outputs.default)

        # calculate
        _h.subtract_tt(inputs_1, inputs_2, out=diff)
        _h.mult_tt(diff, diff, out=outputs)

    def backward_pass(self, buffers):
        # prepare
        _h = self.handler
        out_deltas = flatten_time_and_features(buffers.output_deltas.default)
        diff = flatten_time_and_features(buffers.internals.diff)
        dinputs_1 = flatten_time_and_features(buffers.input_deltas.inputs_1)
        dinputs_2 = flatten_time_and_features(buffers.input_deltas.inputs_2)
        tmp = _h.allocate(out_deltas.shape)

        # calculate
        _h.mult_st(2, out_deltas, out=out_deltas)
        _h.mult_add_tt(out_deltas, diff, out=dinputs_1)
        _h.mult_st(-1, diff, out=tmp)
        _h.mult_add_tt(out_deltas, tmp, out=dinputs_2)
class SquareLayerImpl(Layer):
    # accept inputs in any format
    expected_inputs = {'default': StructureTemplate('...')}
    # no kwargs supported
    expected_kwargs = {}

    # For a custom layer we need to implement the following 3 methods:
    def setup(self, kwargs, in_shapes):
        # In this method we set up the buffer structure of the layer.
        # We can use the kwargs passed to this layer (here we don't)
        # and the shapes of the inputs (an OrderedDict[str, BufferStructure]).
        # This layer is elementwise, so the output shapes should be the same
        # as the input shapes.
        outputs = in_shapes
        parameters = OrderedDict()  # No parameters, so this is empty
        internals = OrderedDict()   # Also no need for internal buffers
        return outputs, parameters, internals

    def forward_pass(self, buffers, training_pass=True):
        inputs = buffers.inputs.default
        outputs = buffers.outputs.default
        self.handler.mult_tt(inputs, inputs, outputs)
        self.handler.mult_st(0.5, outputs, outputs)

    def backward_pass(self, buffers):
        inputs = buffers.inputs.default
        output_deltas = buffers.output_deltas.default
        input_deltas = buffers.input_deltas.default
        self.handler.mult_add_tt(inputs, output_deltas, input_deltas)
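# NumPy transcription of the custom layer above (a sketch, independent of the
# Handler API): forward computes 0.5 * x**2, and backward accumulates
# x * out_deltas into in_deltas, since d(0.5 * x**2)/dx = x. Function name is
# illustrative only.
import numpy as np

def _square_layer_reference(x, out_deltas):
    outputs = 0.5 * x ** 2          # mult_tt followed by mult_st(0.5, ...)
    in_deltas = x * out_deltas      # mult_add_tt(inputs, output_deltas, ...)
    return outputs, in_deltas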
class DropoutLayerImpl(Layer):

    expected_inputs = {'default': StructureTemplate('T', 'B', '...')}
    expected_kwargs = {'drop_prob'}

    def setup(self, kwargs, in_shapes):
        self.drop_prob = kwargs.get('drop_prob', 0.5)
        outputs = OrderedDict()
        outputs['default'] = in_shapes['default']
        internals = OrderedDict()
        internals['mask'] = self.in_shapes['default']
        return outputs, OrderedDict(), internals

    def forward_pass(self, buffers, training_pass=True):
        _h = self.handler
        if training_pass:
            _h.generate_probability_mask(buffers.internals.mask,
                                         1 - self.drop_prob)
            _h.mult_tt(buffers.inputs.default, buffers.internals.mask,
                       out=buffers.outputs.default)
            _h.mult_st(1 / (1 - self.drop_prob), buffers.outputs.default,
                       out=buffers.outputs.default)
        else:
            _h.copy_to(buffers.inputs.default, buffers.outputs.default)

    def backward_pass(self, buffers):
        self.handler.mult_add_tt(buffers.output_deltas.default,
                                 buffers.internals.mask,
                                 buffers.input_deltas.default)
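# Inverted dropout in plain NumPy (sketch, independent of the Handler API):
# units are kept with probability 1 - drop_prob and the survivors are rescaled
# by 1 / (1 - drop_prob), so the expected activation equals the input and the
# test-time path above can be a plain copy.
import numpy as np

def _inverted_dropout(x, drop_prob=0.5, rng=np.random):
    keep = 1.0 - drop_prob
    mask = (rng.rand(*x.shape) < keep).astype(x.dtype)
    return x * mask / keep          # E[return value] == x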
class FullyConnectedLayerImpl(Layer):

    expected_inputs = {'default': StructureTemplate('T', 'B', '...')}
    expected_kwargs = {'size', 'activation'}

    def setup(self, kwargs, in_shapes):
        self.activation = kwargs.get('activation', 'rel')
        self.size = kwargs.get('size',
                               self.in_shapes['default'].feature_shape)
        if isinstance(self.size, int):
            self.size = (self.size,)
        if not isinstance(self.size, (tuple, list)):
            raise LayerValidationError(
                'size must be int or tuple of ints but was {}'.format(
                    self.size))

        in_size = in_shapes['default'].feature_size

        outputs = OrderedDict()
        outputs['default'] = BufferStructure('T', 'B', *self.size)
        out_size = outputs['default'].feature_size

        parameters = OrderedDict()
        parameters['W'] = BufferStructure(out_size, in_size)
        parameters['bias'] = BufferStructure(out_size)

        internals = OrderedDict()
        return outputs, parameters, internals

    def forward_pass(self, buffers, training_pass=True):
        # prepare
        _h = self.handler
        W, bias = buffers.parameters
        inputs = flatten_time_and_features(buffers.inputs.default)
        outputs = flatten_time_and_features(buffers.outputs.default)

        # calculate outputs
        _h.dot_mm(inputs, W, outputs, transb=True)
        _h.add_mv(outputs, bias.reshape((1, bias.shape[0])), outputs)
        _h.inplace_act_func[self.activation](outputs)

    def backward_pass(self, buffers):
        # prepare
        _h = self.handler
        W, bias = buffers.parameters
        dW, dbias = buffers.gradients
        inputs = flatten_time_and_features(buffers.inputs.default)
        outputs = flatten_time_and_features(buffers.outputs.default)
        in_deltas = flatten_time_and_features(buffers.input_deltas.default)
        out_deltas = flatten_time_and_features(buffers.output_deltas.default)

        # calculate in_deltas and gradients
        _h.inplace_act_func_deriv[self.activation](outputs, out_deltas)
        _h.dot_add_mm(out_deltas, W, out=in_deltas)
        _h.dot_mm(out_deltas, inputs, out=dW, transa=True)
        _h.sum_t(out_deltas, axis=0, out=dbias)
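# Reference forward pass in NumPy (sketch; 'rel' is assumed here to denote the
# rectified-linear activation): act(X @ W.T + bias) on the time-flattened
# input, matching dot_mm(..., transb=True) followed by add_mv above.
import numpy as np

def _fc_forward(x_tbf, W, bias):
    T, B = x_tbf.shape[:2]
    X = x_tbf.reshape(T * B, -1)            # flatten_time_and_features
    out = np.maximum(X @ W.T + bias, 0.0)   # ReLU assumed for 'rel'
    return out.reshape(T, B, -1)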
class L2DecayLayerImpl(Layer):

    expected_inputs = {'default': StructureTemplate('T', 'B', '...')}
    expected_kwargs = {}

    def setup(self, kwargs, in_shapes):
        outputs = OrderedDict()
        outputs['loss'] = BufferStructure('T', 'B', 1)

        parameters = OrderedDict()
        internals = OrderedDict()
        internals['tmp'] = in_shapes['default']
        internals['dsq_activations'] = BufferStructure(
            *in_shapes['default'].shape, is_backward_only=True)
        return outputs, parameters, internals

    def forward_pass(self, buffers, training_pass=True):
        # prepare
        _h = self.handler
        assert isinstance(_h, Handler)
        inputs = buffers.inputs.default
        tmp = buffers.internals.tmp
        outputs = buffers.outputs.loss

        # reshape
        flat_inputs = flatten_time_and_features(inputs)
        flat_tmp = flatten_time_and_features(tmp)
        flat_outputs = flatten_time(outputs)

        # compute
        _h.mult_tt(flat_inputs, flat_inputs, flat_tmp)
        _h.mult_st(0.5, flat_tmp, flat_tmp)
        _h.sum_t(flat_tmp, 1, flat_outputs)

    def backward_pass(self, buffers):
        _h = self.handler
        assert isinstance(_h, Handler)
        inputs = buffers.inputs.default
        tmp = buffers.internals.tmp
        output_deltas = buffers.output_deltas.loss
        input_deltas = buffers.input_deltas.default

        # reshape
        flat_inputs = flatten_time_and_features(inputs)
        flat_tmp = flatten_time_and_features(tmp)
        flat_output_deltas = flatten_time(output_deltas)
        flat_input_deltas = flatten_time_and_features(input_deltas)

        # compute
        _h.mult_mv(flat_inputs, flat_output_deltas, flat_tmp)
        _h.add_tt(flat_tmp, flat_input_deltas, flat_input_deltas)
class MaskLayerImpl(Layer):

    expected_inputs = {'default': StructureTemplate('T', 'B', '...'),
                       'mask': StructureTemplate('T', 'B', '...')}

    computes_no_input_deltas_for = ['mask']

    def setup(self, kwargs, in_shapes):
        in_shape = in_shapes['default'].feature_shape

        if in_shapes['mask'].feature_shape not in [(1,), in_shape]:
            raise LayerValidationError(
                "Shape of the mask did not match shape of the default "
                "inputs. Should be either ('T', 'B', 1) or {}, but was {}"
                .format(in_shapes['default'].shape, in_shapes['mask'].shape))

        outputs = OrderedDict()
        outputs['default'] = in_shapes['default']
        return outputs, OrderedDict(), OrderedDict()

    def forward_pass(self, buffers, training_pass=True):
        _h = self.handler
        flat_inp = flatten_time_and_features(buffers.inputs.default)
        flat_mask = flatten_time_and_features(buffers.inputs.mask)
        flat_out = flatten_time_and_features(buffers.outputs.default)
        _h.mult_mv(flat_inp, flat_mask, out=flat_out)

    def backward_pass(self, buffers):
        _h = self.handler
        flat_out_deltas = flatten_time_and_features(
            buffers.output_deltas.default)
        tmp = _h.allocate(flat_out_deltas.shape)
        flat_mask = flatten_time_and_features(buffers.inputs.mask)
        flat_in_deltas = flatten_time_and_features(
            buffers.input_deltas.default)
        _h.mult_mv(flat_out_deltas, flat_mask, tmp)
        _h.add_tt(tmp, flat_in_deltas, flat_in_deltas)
class MergeLayerImpl(Layer):

    expected_inputs = {
        'inputs_1': StructureTemplate('...'),
        'inputs_2': StructureTemplate('...')
    }
    expected_kwargs = {}

    def setup(self, kwargs, in_shapes):
        # 'inputs_1' and 'inputs_2' must have the same shape
        # except for the last dimension
        shape_prefix1 = in_shapes['inputs_1'].shape[:-1]
        shape_prefix2 = in_shapes['inputs_2'].shape[:-1]
        if shape_prefix1 != shape_prefix2:
            raise LayerValidationError(
                "{}: The shapes of inputs_1 and inputs_2 may only differ in "
                "the last dimension but got {} and {}".format(
                    self.name, in_shapes['inputs_1'].shape,
                    in_shapes['inputs_2'].shape))

        combined_size = (in_shapes['inputs_1'].shape[-1] +
                         in_shapes['inputs_2'].shape[-1])
        out_shape = shape_prefix1 + (combined_size,)

        outputs = OrderedDict()
        outputs['default'] = BufferStructure(*out_shape)
        return outputs, OrderedDict(), OrderedDict()

    def forward_pass(self, buffers, training_pass=True):
        self.handler.merge_tt(buffers.inputs.inputs_1,
                              buffers.inputs.inputs_2,
                              buffers.outputs.default)

    def backward_pass(self, buffers):
        self.handler.split_add_tt(buffers.output_deltas.default,
                                  buffers.input_deltas.inputs_1,
                                  buffers.input_deltas.inputs_2)
class DeltasScalingLayerImpl(Layer):

    expected_inputs = {'default': StructureTemplate('T', 'B', '...')}
    expected_kwargs = {'factor'}

    def setup(self, kwargs, in_shapes):
        if 'factor' not in kwargs:
            raise LayerValidationError('Missing required "factor" argument')
        self.factor = kwargs['factor']
        return in_shapes, OrderedDict(), OrderedDict()

    def forward_pass(self, buffers, training_pass=True):
        self.handler.copy_to(buffers.inputs.default, buffers.outputs.default)

    def backward_pass(self, buffers):
        self.handler.mult_add_st(self.factor, buffers.output_deltas.default,
                                 buffers.input_deltas.default)
class ElementwiseLayerImpl(Layer):

    expected_inputs = {'default': StructureTemplate('T', 'B', '...')}
    expected_kwargs = {'activation'}

    def setup(self, kwargs, in_shapes):
        self.activation = kwargs.get('activation', 'rel')
        return in_shapes, OrderedDict(), OrderedDict()

    def forward_pass(self, buffers, training_pass=True):
        self.handler.act_func[self.activation](buffers.inputs.default,
                                               buffers.outputs.default)

    def backward_pass(self, buffers):
        tmp = self.handler.allocate(buffers.input_deltas.default.shape)
        self.handler.act_func_deriv[self.activation](
            buffers.inputs.default, buffers.outputs.default,
            buffers.output_deltas.default, tmp)
        self.handler.add_tt(buffers.input_deltas.default, tmp,
                            buffers.input_deltas.default)
class LossLayerImpl(Layer):

    expected_inputs = {'default': StructureTemplate('...')}
    expected_kwargs = {'importance'}

    def setup(self, kwargs, in_shapes):
        assert self.name != 'total_loss'
        self.importance = kwargs.get('importance', 1.0)
        self.batch_index = None
        if in_shapes['default'].scales_with_time:
            self.batch_index = 1
        elif in_shapes['default'].scales_with_batch_size:
            self.batch_index = 0

        outputs = OrderedDict()
        outputs['loss'] = BufferStructure(1)
        return outputs, OrderedDict(), OrderedDict()

    def forward_pass(self, buffers, training_pass=True):
        if self.batch_index is None:
            batch_size = 1.0
        else:
            batch_size = buffers.inputs.default.shape[self.batch_index]

        self.handler.sum_t(buffers.inputs.default, None,
                           buffers.outputs.loss.reshape(tuple()))
        self.handler.mult_st(self.importance / batch_size,
                             buffers.outputs.loss,
                             buffers.outputs.loss)

    def backward_pass(self, buffers):
        if self.batch_index is None:
            batch_size = 1.0
        else:
            batch_size = buffers.inputs.default.shape[self.batch_index]

        self.handler.add_st(self.importance / batch_size,
                            buffers.input_deltas.default,
                            buffers.input_deltas.default)
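# What forward_pass above reduces to, in NumPy (sketch): the scalar loss is
# importance * sum(inputs) / batch_size, where batch_size is read from the
# time or batch axis only when the input scales with it. Names illustrative.
import numpy as np

def _loss_scalar(x, importance=1.0, batch_index=None):
    batch_size = 1.0 if batch_index is None else x.shape[batch_index]
    return importance * x.sum() / batch_size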
def test_illegal_structure_template_raise(shape):
    with pytest.raises(StructureValidationError):
        StructureTemplate(*shape)
class LstmLayerImpl(Layer):

    expected_inputs = {'default': StructureTemplate('T', 'B', 'F')}
    expected_kwargs = {'size', 'activation'}

    def setup(self, kwargs, in_shapes):
        self.activation = kwargs.get('activation', 'tanh')
        in_size = in_shapes['default'].feature_size
        self.size = kwargs.get('size', in_size)
        if not isinstance(self.size, int):
            raise LayerValidationError('size must be int but was {}'.format(
                self.size))

        outputs = OrderedDict()
        outputs['default'] = BufferStructure('T', 'B', self.size,
                                             context_size=1)

        parameters = OrderedDict()
        for name in ['Wz', 'Wi', 'Wf', 'Wo']:
            parameters[name] = BufferStructure(self.size, in_size)
        for name in ['pi', 'pf', 'po']:
            parameters[name] = BufferStructure(1, self.size)
        for name in ['Rz', 'Ri', 'Rf', 'Ro']:
            parameters[name] = BufferStructure(self.size, self.size)
        for name in ['bz', 'bi', 'bf', 'bo']:
            parameters[name] = BufferStructure(self.size)

        internals = OrderedDict()
        for name in ['Za', 'Zb', 'Ia', 'Ib', 'Fa', 'Fb',
                     'Oa', 'Ob', 'Ca', 'Cb']:
            internals[name] = BufferStructure('T', 'B', self.size,
                                              context_size=1)
        for name in ['dZa', 'dZb', 'dIa', 'dIb', 'dFa', 'dFb',
                     'dOa', 'dOb', 'dCa', 'dCb']:
            internals[name] = BufferStructure('T', 'B', self.size,
                                              context_size=1,
                                              is_backward_only=True)
        return outputs, parameters, internals

    def forward_pass(self, buffers, training_pass=True):
        # prepare
        _h = self.handler
        (Wz, Wi, Wf, Wo, pi, pf, po,
         Rz, Ri, Rf, Ro,
         bz, bi, bf, bo) = buffers.parameters
        (Za, Zb, Ia, Ib, Fa, Fb, Oa, Ob, Ca, Cb,
         dZa, dZb, dIa, dIb, dFa, dFb, dOa, dOb,
         dCa, dCb) = buffers.internals
        x = buffers.inputs.default
        y = buffers.outputs.default
        time_size, batch_size, in_size = x.shape

        flat_x = flatten_time(x)
        flat_Za = flatten_time(Za[:-1])
        flat_Ia = flatten_time(Ia[:-1])
        flat_Fa = flatten_time(Fa[:-1])
        flat_Oa = flatten_time(Oa[:-1])
        _h.dot_mm(flat_x, Wz, flat_Za, transb=True)
        _h.dot_mm(flat_x, Wi, flat_Ia, transb=True)
        _h.dot_mm(flat_x, Wf, flat_Fa, transb=True)
        _h.dot_mm(flat_x, Wo, flat_Oa, transb=True)

        for t in range(time_size):
            # Block input
            _h.dot_add_mm(y[t - 1], Rz, Za[t], transb=True)
            _h.add_mv(Za[t], bz.reshape((1, self.size)), Za[t])
            _h.act_func[self.activation](Za[t], Zb[t])

            # Input Gate
            _h.dot_add_mm(y[t - 1], Ri, Ia[t], transb=True)
            _h.mult_add_mv(Ca[t - 1], pi, Ia[t])
            _h.add_mv(Ia[t], bi.reshape((1, self.size)), Ia[t])
            _h.sigmoid(Ia[t], Ib[t])

            # Forget Gate
            _h.dot_add_mm(y[t - 1], Rf, Fa[t], transb=True)
            _h.mult_add_mv(Ca[t - 1], pf, Fa[t])
            _h.add_mv(Fa[t], bf.reshape((1, self.size)), Fa[t])
            _h.sigmoid(Fa[t], Fb[t])

            # Cell
            _h.mult_tt(Ib[t], Zb[t], Ca[t])
            _h.mult_add_tt(Fb[t], Ca[t - 1], Ca[t])

            # Output Gate
            _h.dot_add_mm(y[t - 1], Ro, Oa[t], transb=True)
            _h.mult_add_mv(Ca[t], po, Oa[t])
            _h.add_mv(Oa[t], bo.reshape((1, self.size)), Oa[t])
            _h.sigmoid(Oa[t], Ob[t])

            # Block output
            _h.act_func[self.activation](Ca[t], Cb[t])
            _h.mult_tt(Ob[t], Cb[t], y[t])

    def backward_pass(self, buffers):
        # prepare
        _h = self.handler
        (Wz, Wi, Wf, Wo, pi, pf, po,
         Rz, Ri, Rf, Ro,
         bz, bi, bf, bo) = buffers.parameters
        (dWz, dWi, dWf, dWo, dpi, dpf, dpo,
         dRz, dRi, dRf, dRo,
         dbz, dbi, dbf, dbo) = buffers.gradients
        (Za, Zb, Ia, Ib, Fa, Fb, Oa, Ob, Ca, Cb,
         dZa, dZb, dIa, dIb, dFa, dFb, dOa, dOb,
         dCa, dCb) = buffers.internals
        x = buffers.inputs.default
        dx = buffers.input_deltas.default
        y = buffers.outputs.default
        deltas = buffers.output_deltas.default

        dy = _h.allocate(y.shape)
        _h.fill(dCa, 0.0)
        time_size, batch_size, in_size = x.shape

        for t in range(time_size - 1, -1, -1):
            # Accumulate recurrent deltas
            _h.copy_to(deltas[t], dy[t])
            _h.dot_add_mm(dIa[t + 1], Ri, dy[t])
            _h.dot_add_mm(dFa[t + 1], Rf, dy[t])
            _h.dot_add_mm(dOa[t + 1], Ro, dy[t])
            _h.dot_add_mm(dZa[t + 1], Rz, dy[t])
            # Peephole connection part:
            _h.mult_add_mv(dIa[t + 1], pi, dCa[t])
            _h.mult_add_mv(dFa[t + 1], pf, dCa[t])

            # Output Gate
            _h.mult_tt(dy[t], Cb[t], dOb[t])
            _h.sigmoid_deriv(Oa[t], Ob[t], dOb[t], dOa[t])
            # Peephole connection
            _h.mult_add_mv(dOa[t], po, dCa[t])

            # Cell
            _h.mult_tt(dy[t], Ob[t], dCb[t])
            _h.act_func_deriv[self.activation](Ca[t], Cb[t], dCb[t], dCb[t])
            _h.add_tt(dCa[t], dCb[t], dCa[t])
            _h.mult_add_tt(dCa[t + 1], Fb[t + 1], dCa[t])

            # Forget Gate
            _h.mult_tt(dCa[t], Ca[t - 1], dFb[t])
            _h.sigmoid_deriv(Fa[t], Fb[t], dFb[t], dFa[t])

            # Input Gate
            _h.mult_tt(dCa[t], Zb[t], dIb[t])
            _h.sigmoid_deriv(Ia[t], Ib[t], dIb[t], dIa[t])

            # Block Input
            _h.mult_tt(dCa[t], Ib[t], dZb[t])
            _h.act_func_deriv[self.activation](Za[t], Zb[t], dZb[t], dZa[t])

        flat_inputs = flatten_time(x)
        flat_dinputs = flatten_time(dx)
        flat_dIa = flatten_time(dIa[:-1])
        flat_dFa = flatten_time(dFa[:-1])
        flat_dOa = flatten_time(dOa[:-1])
        flat_dZa = flatten_time(dZa[:-1])

        # Calculate in_deltas and gradients
        _h.dot_add_mm(flat_dIa, Wi, flat_dinputs)
        _h.dot_add_mm(flat_dFa, Wf, flat_dinputs)
        _h.dot_add_mm(flat_dOa, Wo, flat_dinputs)
        _h.dot_add_mm(flat_dZa, Wz, flat_dinputs)

        _h.dot_add_mm(flat_dIa, flat_inputs, dWi, transa=True)
        _h.dot_add_mm(flat_dFa, flat_inputs, dWf, transa=True)
        _h.dot_add_mm(flat_dOa, flat_inputs, dWo, transa=True)
        _h.dot_add_mm(flat_dZa, flat_inputs, dWz, transa=True)

        dbias_tmp = _h.allocate(dbz.shape)
        _h.sum_t(flat_dIa, axis=0, out=dbias_tmp)
        _h.add_tt(dbi, dbias_tmp, dbi)
        _h.sum_t(flat_dFa, axis=0, out=dbias_tmp)
        _h.add_tt(dbf, dbias_tmp, dbf)
        _h.sum_t(flat_dOa, axis=0, out=dbias_tmp)
        _h.add_tt(dbo, dbias_tmp, dbo)
        _h.sum_t(flat_dZa, axis=0, out=dbias_tmp)
        _h.add_tt(dbz, dbias_tmp, dbz)

        flat_outputs = flatten_time(y[:-2])
        flat_cell = flatten_time(Ca[:-2])
        flat_cell2 = flatten_time(Ca[:-1])

        dWco_tmp = _h.allocate(flat_cell2.shape)
        dWc_tmp = _h.allocate(dpo.shape)
        # Output gate Peephole
        _h.mult_tt(flat_cell2, flat_dOa, dWco_tmp)
        _h.sum_t(dWco_tmp, axis=0, out=dWc_tmp)
        _h.add_tt(dpo, dWc_tmp, dpo)

        flat_dIa = flatten_time(dIa[1:-1])
        flat_dFa = flatten_time(dFa[1:-1])
        flat_dOa = flatten_time(dOa[1:-1])
        flat_dZa = flatten_time(dZa[1:-1])

        _h.dot_add_mm(flat_dIa, flat_outputs, dRi, transa=True)
        _h.dot_add_mm(flat_dFa, flat_outputs, dRf, transa=True)
        _h.dot_add_mm(flat_dOa, flat_outputs, dRo, transa=True)
        _h.dot_add_mm(flat_dZa, flat_outputs, dRz, transa=True)
        # contribution from t == 0 (y[-1] holds the left context frame)
        _h.dot_add_mm(dIa[0], y[-1], dRi, transa=True)
        _h.dot_add_mm(dFa[0], y[-1], dRf, transa=True)
        _h.dot_add_mm(dOa[0], y[-1], dRo, transa=True)
        _h.dot_add_mm(dZa[0], y[-1], dRz, transa=True)

        # Other Peephole connections
        dWcif_tmp = _h.allocate(flat_cell.shape)
        _h.mult_tt(flat_cell, flat_dIa, dWcif_tmp)
        _h.sum_t(dWcif_tmp, axis=0, out=dWc_tmp)
        _h.add_tt(dpi, dWc_tmp, dpi)
        _h.mult_tt(flat_cell, flat_dFa, dWcif_tmp)
        _h.sum_t(dWcif_tmp, axis=0, out=dWc_tmp)
        _h.add_tt(dpf, dWc_tmp, dpf)
        # t == 0 contributions (Ca[-1] holds the context cell state)
        dWcif_tmp = _h.allocate(dIa[0].shape)
        _h.mult_tt(Ca[-1], dIa[0], dWcif_tmp)
        _h.sum_t(dWcif_tmp, axis=0, out=dWc_tmp)
        _h.add_tt(dpi, dWc_tmp, dpi)
        _h.mult_tt(Ca[-1], dFa[0], dWcif_tmp)
        _h.sum_t(dWcif_tmp, axis=0, out=dWc_tmp)
        _h.add_tt(dpf, dWc_tmp, dpf)
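# One time step of the forward pass above, transcribed to NumPy (sketch; tanh
# is assumed for the block input/output nonlinearity, matching the
# 'activation' default). p is a plain dict holding the parameters named above;
# the peephole weights pi, pf, po multiply the cell state elementwise.
import numpy as np

def _lstm_step(x, y_prev, c_prev, p):
    sig = lambda a: 1.0 / (1.0 + np.exp(-a))
    z = np.tanh(x @ p['Wz'].T + y_prev @ p['Rz'].T + p['bz'])
    i = sig(x @ p['Wi'].T + y_prev @ p['Ri'].T + c_prev * p['pi'] + p['bi'])
    f = sig(x @ p['Wf'].T + y_prev @ p['Rf'].T + c_prev * p['pf'] + p['bf'])
    c = i * z + f * c_prev                  # Cell
    o = sig(x @ p['Wo'].T + y_prev @ p['Ro'].T + c * p['po'] + p['bo'])
    return o * np.tanh(c), c                # Block output, new cell state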
class Pooling2DLayerImpl(Layer):

    expected_inputs = {'default': StructureTemplate('T', 'B', '...')}
    expected_kwargs = {'kernel_size', 'type', 'stride', 'padding',
                       'activation_function'}

    def setup(self, kwargs, in_shapes):
        assert 'kernel_size' in kwargs, \
            "kernel_size must be specified for Pooling2D"
        assert 'type' in kwargs, "type must be specified for Pooling2D"
        kernel_size = kwargs['kernel_size']
        ptype = kwargs['type']
        padding = kwargs.get('padding', 0)
        stride = kwargs.get('stride', (1, 1))
        in_shape = self.in_shapes['default'].feature_shape

        assert ptype in ('max', 'avg')
        assert type(padding) is int and padding >= 0, \
            "Invalid padding: {}".format(padding)
        assert type(kernel_size) in [list, tuple] and \
            len(kernel_size) == 2, \
            "Kernel size must be list or tuple of length 2: {}".format(
                kernel_size)
        assert type(stride) in [list, tuple] and len(stride) == 2, \
            "Stride must be list or tuple of length 2: {}".format(stride)
        assert stride[0] > 0 and stride[1] > 0, \
            "Invalid stride: {}".format(stride)
        assert isinstance(in_shape, tuple) and len(in_shape) == 3, \
            "PoolingLayer2D must have 3 dimensional input but input " \
            "shape was %s" % in_shape

        self.kernel_size = tuple(kernel_size)
        self.type = ptype
        self.padding = padding
        self.stride = tuple(stride)

        output_height = ((in_shape[0] + 2 * padding - kernel_size[0]) //
                         stride[0]) + 1
        output_width = ((in_shape[1] + 2 * padding - kernel_size[1]) //
                        stride[1]) + 1
        assert output_height > 0 and output_width > 0, \
            "Evaluated output height and width must be positive but were " \
            "({}, {})".format(output_height, output_width)
        output_shape = (output_height, output_width, in_shape[2])

        outputs = OrderedDict()
        outputs['default'] = BufferStructure('T', 'B', *output_shape)

        internals = OrderedDict()
        if self.type == 'max':
            argmax_shape = outputs['default'].feature_shape
            internals['argmax'] = BufferStructure('T', 'B', *argmax_shape)
        return outputs, OrderedDict(), internals

    def forward_pass(self, buffers, training_pass=True):
        # prepare
        _h = self.handler
        inputs = buffers.inputs.default
        outputs = buffers.outputs.default

        # reshape
        flat_inputs = flatten_time(inputs)
        flat_outputs = flatten_time(outputs)

        # calculate outputs
        if self.type == 'max':
            argmax = buffers.internals.argmax
            flat_argmax = flatten_time(argmax)
            _h.maxpool2d_forward_batch(flat_inputs, self.kernel_size,
                                       flat_outputs, self.padding,
                                       self.stride, flat_argmax)
        elif self.type == 'avg':
            _h.avgpool2d_forward_batch(flat_inputs, self.kernel_size,
                                       flat_outputs, self.padding,
                                       self.stride)

    def backward_pass(self, buffers):
        # prepare
        _h = self.handler
        inputs = buffers.inputs.default
        outputs = buffers.outputs.default
        in_deltas = buffers.input_deltas.default
        out_deltas = buffers.output_deltas.default

        # reshape
        flat_inputs = flatten_time(inputs)
        flat_in_deltas = flatten_time(in_deltas)
        flat_out_deltas = flatten_time(out_deltas)
        flat_outputs = flatten_time(outputs)

        if self.type == 'max':
            argmax = buffers.internals.argmax
            flat_argmax = flatten_time(argmax)
            _h.maxpool2d_backward_batch(flat_inputs, self.kernel_size,
                                        flat_outputs, self.padding,
                                        self.stride, flat_argmax,
                                        flat_in_deltas, flat_out_deltas)
        elif self.type == 'avg':
            _h.avgpool2d_backward_batch(flat_inputs, self.kernel_size,
                                        flat_outputs, self.padding,
                                        self.stride, flat_in_deltas,
                                        flat_out_deltas)
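# The output-size arithmetic used in setup(), factored into a standalone
# helper (sketch): out = (in + 2 * padding - kernel) // stride + 1.
def _pool_out_size(in_size, kernel, stride=1, padding=0):
    return (in_size + 2 * padding - kernel) // stride + 1

assert _pool_out_size(28, 2, stride=2) == 14   # e.g. 28x28 pools to 14x14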
class BinomialCrossEntropyLayerImpl(Layer):

    expected_inputs = {'default': StructureTemplate('T', 'B', '...'),
                       'targets': StructureTemplate('T', 'B', '...')}
    expected_kwargs = {}

    computes_no_input_deltas_for = ['targets']

    def setup(self, kwargs, in_shapes):
        if in_shapes['default'] != in_shapes['targets']:
            raise LayerValidationError("{}: default and targets must have "
                                       "the same shapes but got {} and {}"
                                       .format(self.name,
                                               in_shapes['default'],
                                               in_shapes['targets']))

        outputs = OrderedDict()
        outputs['default'] = BufferStructure('T', 'B', 1)

        feature_shape = in_shapes['default'].feature_shape
        internals = OrderedDict()
        internals['cee'] = BufferStructure('T', 'B', *feature_shape)
        internals['ceed'] = BufferStructure('T', 'B', *feature_shape,
                                            is_backward_only=True)
        return outputs, OrderedDict(), internals

    def forward_pass(self, buffers, training_pass=True):
        # prepare
        _h = self.handler
        y = buffers.inputs.default
        t = buffers.inputs.targets
        cee = buffers.internals.cee
        cee_sum = buffers.outputs.default

        # the binomial cross entropy error is given by
        # - t * ln(y) - (1-t) * ln(1-y)
        tmp = _h.ones(cee.shape)
        _h.subtract_tt(tmp, y, cee)       # cee = 1-y
        _h.subtract_tt(tmp, t, tmp)       # tmp = 1-t

        _h.clip_t(cee, 1e-6, 1.0, cee)
        _h.log_t(cee, cee)                # cee = ln(1-y)
        _h.mult_tt(tmp, cee, tmp)         # tmp = (1-t) * ln(1-y)

        _h.clip_t(y, 1e-6, 1.0, cee)
        _h.log_t(cee, cee)                # cee = ln(y)
        _h.mult_tt(t, cee, cee)           # cee = t * ln(y)

        _h.add_tt(tmp, cee, cee)          # cee = (1-t) * ln(1-y) + t * ln(y)

        # reshape for summation
        cee = flatten_time_and_features(cee)
        cee_sum = flatten_time(cee_sum)
        _h.sum_t(cee, axis=1, out=cee_sum)
        _h.mult_st(-1, cee_sum, cee_sum)  # * -1

    def backward_pass(self, buffers):
        # prepare
        _h = self.handler
        ceed_sum = buffers.output_deltas.default
        ceed = buffers.internals.ceed
        tmp = _h.allocate(ceed.shape)
        y = buffers.inputs.default
        t = buffers.inputs.targets
        yd = buffers.input_deltas.default

        # the derivative of the binomial cross entropy error is given by
        # (y - t) / (y - y²)
        _h.mult_tt(y, y, ceed)            # ceed = y²
        _h.subtract_tt(y, ceed, ceed)     # ceed = y - y²
        _h.clip_t(ceed, 1e-6, 1.0, ceed)  # clip
        _h.subtract_tt(y, t, tmp)         # tmp = y - t
        _h.divide_tt(tmp, ceed, ceed)     # ceed = (y - t) / (y - y²)

        # ceed_sum has only one feature dimension due to summation,
        # so we broadcast to all feature dimensions
        _h.broadcast_t(ceed_sum, 2, tmp)
        _h.mult_tt(ceed, tmp, ceed)
        _h.add_tt(ceed, yd, yd)
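# NumPy check of the error and its derivative as documented above (sketch):
# loss = -t*ln(y) - (1-t)*ln(1-y) and d loss / d y = (y - t) / (y - y**2),
# verified here by central finite differences. Names are illustrative.
import numpy as np

def _bce_check(seed=0):
    rng = np.random.RandomState(seed)
    y = np.clip(rng.rand(4, 3, 5), 1e-3, 1 - 1e-3)
    t = (rng.rand(4, 3, 5) > 0.5).astype(float)
    f = lambda yy: -(t * np.log(yy) + (1 - t) * np.log(1 - yy))
    grad = (y - t) / (y - y ** 2)
    eps = 1e-7
    numeric = (f(y + eps) - f(y - eps)) / (2 * eps)
    assert np.allclose(numeric, grad, rtol=1e-4)
    return f(y).sum(axis=2)              # summed over features, like cee_sum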
def test_structure_template_matches4(shape, expected):
    st = StructureTemplate(1, 2, 7)
    struct = BufferStructure(*shape)
    assert st.matches(struct) == expected
class HighwayRNNCoupledGatesLayerImpl(Layer):

    expected_inputs = {'default': StructureTemplate('T', 'B', '...')}
    expected_kwargs = {'size', 'activation', 'recurrence_depth',
                       'block_size', 'sizes_list'}

    def setup(self, kwargs, in_shapes):
        self.activation = kwargs.get('activation', 'tanh')
        self.size = kwargs.get('size',
                               self.in_shapes['default'].feature_size)
        self.recurrence_depth = kwargs.get('recurrence_depth', 1)
        if not isinstance(self.size, int):
            raise LayerValidationError('size must be int but was {}'.format(
                self.size))
        if not isinstance(self.recurrence_depth, int):
            raise LayerValidationError(
                'recurrence_depth must be int but was {}'.format(
                    self.recurrence_depth))

        in_size = self.in_shapes['default'].feature_size

        outputs = OrderedDict()
        outputs['default'] = BufferStructure('T', 'B', self.size,
                                             context_size=1)

        parameters = OrderedDict()
        parameters['W_H'] = BufferStructure(self.size, in_size)
        parameters['W_T'] = BufferStructure(self.size, in_size)
        parameters['R_T'] = BufferStructure(self.recurrence_depth,
                                            self.size, self.size)
        parameters['bias_T'] = BufferStructure(self.recurrence_depth,
                                               self.size)
        parameters['R_H'] = BufferStructure(self.recurrence_depth,
                                            self.size, self.size)
        parameters['bias_H'] = BufferStructure(self.recurrence_depth,
                                               self.size)

        internals = OrderedDict()
        for i in range(self.recurrence_depth):
            internals['H_{}'.format(i)] = BufferStructure(
                'T', 'B', self.size, context_size=1)
            internals['T_{}'.format(i)] = BufferStructure(
                'T', 'B', self.size, context_size=1)
            internals['Y_{}'.format(i)] = BufferStructure(
                'T', 'B', self.size, context_size=1)
            internals['dH_{}'.format(i)] = BufferStructure(
                'T', 'B', self.size, context_size=1, is_backward_only=True)
            internals['dT_{}'.format(i)] = BufferStructure(
                'T', 'B', self.size, context_size=1, is_backward_only=True)
            internals['dY_{}'.format(i)] = BufferStructure(
                'T', 'B', self.size, context_size=1, is_backward_only=True)
        return outputs, parameters, internals

    def forward_pass(self, buffers, training_pass=True):
        # prepare
        _h = self.handler
        W_H, W_T, R_T, bias_T, R_H, bias_H = buffers.parameters
        inputs = buffers.inputs.default
        outputs = buffers.outputs.default

        H_list = []
        T_list = []
        Y_list = []
        for i in range(self.recurrence_depth):
            H_list.append(buffers.internals['H_{}'.format(i)])
            T_list.append(buffers.internals['T_{}'.format(i)])
            Y_list.append(buffers.internals['Y_{}'.format(i)])

        flat_inputs = flatten_time_and_features(inputs)
        flat_H = flatten_time(H_list[0][:-1])
        flat_T = flatten_time(T_list[0][:-1])
        _h.dot_mm(flat_inputs, W_H, flat_H, transb=True)
        _h.dot_mm(flat_inputs, W_T, flat_T, transb=True)

        for t in range(inputs.shape[0]):
            for i in range(self.recurrence_depth):
                if i == 0:
                    x = outputs[t - 1]
                    _h.dot_add_mm(x, R_T[i], T_list[i][t], transb=True)
                    _h.add_mv(T_list[i][t],
                              bias_T[i].reshape((1, self.size)),
                              T_list[i][t])
                    _h.inplace_act_func['sigmoid'](T_list[i][t])
                    _h.dot_add_mm(x, R_H[i], H_list[i][t], transb=True)
                    _h.add_mv(H_list[i][t],
                              bias_H[i].reshape((1, self.size)),
                              H_list[i][t])
                    _h.inplace_act_func[self.activation](H_list[i][t])
                else:
                    x = Y_list[i - 1][t]
                    _h.dot_mm(x, R_T[i], T_list[i][t], transb=True)
                    _h.add_mv(T_list[i][t],
                              bias_T[i].reshape((1, self.size)),
                              T_list[i][t])
                    _h.inplace_act_func['sigmoid'](T_list[i][t])
                    _h.dot_mm(x, R_H[i], H_list[i][t], transb=True)
                    _h.add_mv(H_list[i][t],
                              bias_H[i].reshape((1, self.size)),
                              H_list[i][t])
                    _h.inplace_act_func[self.activation](H_list[i][t])

                if i == 0:
                    # Y = T * H + (1 - T) * y_prev (coupled gates)
                    _h.mult_tt(T_list[i][t], H_list[i][t], out=Y_list[i][t])
                    tmp = _h.ones(H_list[i][t].shape)
                    _h.subtract_tt(tmp, T_list[i][t], tmp)
                    _h.mult_add_tt(tmp, outputs[t - 1], out=Y_list[i][t])
                else:
                    _h.mult_tt(T_list[i][t], H_list[i][t], out=Y_list[i][t])
                    tmp = _h.ones(H_list[i][t].shape)
                    _h.subtract_tt(tmp, T_list[i][t], tmp)
                    _h.mult_add_tt(tmp, Y_list[i - 1][t], out=Y_list[i][t])

            _h.copy_to(Y_list[self.recurrence_depth - 1][t], outputs[t])

    def backward_pass(self, buffers):
        # prepare
        _h = self.handler
        W_H, W_T, R_T, bias_T, R_H, bias_H = buffers.parameters
        dW_H, dW_T, dR_T, dbias_T, dR_H, dbias_H = buffers.gradients
        inputs = buffers.inputs.default
        outputs = buffers.outputs.default
        dinputs = buffers.input_deltas.default
        doutputs = buffers.output_deltas.default

        H_list = []
        T_list = []
        Y_list = []
        dH_list = []
        dT_list = []
        dY_list = []
        for i in range(self.recurrence_depth):
            H_list.append(buffers.internals['H_{}'.format(i)])
            T_list.append(buffers.internals['T_{}'.format(i)])
            Y_list.append(buffers.internals['Y_{}'.format(i)])
            dH_list.append(buffers.internals['dH_{}'.format(i)])
            dT_list.append(buffers.internals['dT_{}'.format(i)])
            dY_list.append(buffers.internals['dY_{}'.format(i)])

        t = inputs.shape[0] - 1
        _h.copy_to(doutputs[t], dY_list[self.recurrence_depth - 1][t])
        for i in range(self.recurrence_depth - 1, -1, -1):
            if i == 0:
                _h.mult_tt(dY_list[i][t], T_list[i][t], dH_list[i][t])
                tmp = _h.ones(dH_list[i][t].shape)
                _h.subtract_tt(H_list[i][t], outputs[t - 1], tmp)
                _h.mult_tt(dY_list[i][t], tmp, dT_list[i][t])
                _h.inplace_act_func_deriv['sigmoid'](T_list[i][t],
                                                     dT_list[i][t])
                _h.inplace_act_func_deriv[self.activation](H_list[i][t],
                                                           dH_list[i][t])
            else:
                _h.mult_tt(dY_list[i][t], T_list[i][t], dH_list[i][t])
                tmp = _h.ones(dH_list[i][t].shape)
                _h.subtract_tt(tmp, T_list[i][t], tmp)
                _h.mult_tt(dY_list[i][t], tmp, dY_list[i - 1][t])
                _h.subtract_tt(H_list[i][t], Y_list[i - 1][t], tmp)
                _h.mult_tt(dY_list[i][t], tmp, dT_list[i][t])
                _h.inplace_act_func_deriv['sigmoid'](T_list[i][t],
                                                     dT_list[i][t])
                _h.inplace_act_func_deriv[self.activation](H_list[i][t],
                                                           dH_list[i][t])
                _h.dot_add_mm(dT_list[i][t], R_T[i], dY_list[i - 1][t])
                _h.dot_add_mm(dH_list[i][t], R_H[i], dY_list[i - 1][t])

        for t in range(inputs.shape[0] - 2, -1, -1):
            _h.dot_add_mm(dT_list[0][t + 1], R_T[0], doutputs[t])
            _h.dot_add_mm(dH_list[0][t + 1], R_H[0], doutputs[t])
            tmp = _h.ones(dH_list[0][t + 1].shape)
            _h.subtract_tt(tmp, T_list[0][t + 1], tmp)
            _h.mult_add_tt(dY_list[0][t + 1], tmp, doutputs[t])
            _h.copy_to(doutputs[t], dY_list[self.recurrence_depth - 1][t])
            for i in range(self.recurrence_depth - 1, -1, -1):
                if i == 0:
                    _h.mult_tt(dY_list[i][t], T_list[i][t], dH_list[i][t])
                    tmp = _h.ones(dH_list[i][t].shape)
                    _h.subtract_tt(H_list[i][t], outputs[t - 1], tmp)
                    _h.mult_tt(dY_list[i][t], tmp, dT_list[i][t])
                    _h.inplace_act_func_deriv['sigmoid'](T_list[i][t],
                                                         dT_list[i][t])
                    _h.inplace_act_func_deriv[self.activation](H_list[i][t],
                                                               dH_list[i][t])
                else:
                    _h.mult_tt(dY_list[i][t], T_list[i][t], dH_list[i][t])
                    tmp = _h.ones(dH_list[i][t].shape)
                    _h.subtract_tt(tmp, T_list[i][t], tmp)
                    _h.mult_tt(dY_list[i][t], tmp, dY_list[i - 1][t])
                    _h.subtract_tt(H_list[i][t], Y_list[i - 1][t], tmp)
                    _h.mult_tt(dY_list[i][t], tmp, dT_list[i][t])
                    _h.inplace_act_func_deriv['sigmoid'](T_list[i][t],
                                                         dT_list[i][t])
                    _h.inplace_act_func_deriv[self.activation](H_list[i][t],
                                                               dH_list[i][t])
                    _h.dot_add_mm(dT_list[i][t], R_T[i], dY_list[i - 1][t])
                    _h.dot_add_mm(dH_list[i][t], R_H[i], dY_list[i - 1][t])

        flat_inputs = flatten_time_and_features(inputs)
        flat_dinputs = flatten_time_and_features(dinputs)
        flat_dH = flatten_time(dH_list[0][:-1])
        flat_dT = flatten_time(dT_list[0][:-1])

        # calculate in_deltas and gradients
        _h.dot_add_mm(flat_dH, W_H, flat_dinputs)
        _h.dot_add_mm(flat_dH, flat_inputs, dW_H, transa=True)
        _h.dot_add_mm(flat_dT, W_T, flat_dinputs)
        _h.dot_add_mm(flat_dT, flat_inputs, dW_T, transa=True)

        for i in range(self.recurrence_depth):
            dbias_tmp = _h.allocate(dbias_H[i].shape)
            flat_dH = flatten_time(dH_list[i][:-1])
            flat_dT = flatten_time(dT_list[i][:-1])
            _h.sum_t(flat_dT, axis=0, out=dbias_tmp)
            _h.add_tt(dbias_T[i], dbias_tmp, dbias_T[i])
            _h.sum_t(flat_dH, axis=0, out=dbias_tmp)
            _h.add_tt(dbias_H[i], dbias_tmp, dbias_H[i])

        for i in range(self.recurrence_depth):
            if i == 0:
                flat_outputs = flatten_time(outputs[:-2])
                flat_dH = flatten_time(dH_list[i][1:-1])
                flat_dT = flatten_time(dT_list[i][1:-1])
                _h.dot_add_mm(flat_dT, flat_outputs, dR_T[i], transa=True)
                _h.dot_add_mm(dT_list[i][0], outputs[-1], dR_T[i],
                              transa=True)
                _h.dot_add_mm(flat_dH, flat_outputs, dR_H[i], transa=True)
                _h.dot_add_mm(dH_list[i][0], outputs[-1], dR_H[i],
                              transa=True)
            else:
                flat_outputs = flatten_time(Y_list[i - 1][:-1])
                flat_dH = flatten_time(dH_list[i][:-1])
                flat_dT = flatten_time(dT_list[i][:-1])
                _h.dot_add_mm(flat_dT, flat_outputs, dR_T[i], transa=True)
                _h.dot_add_mm(flat_dH, flat_outputs, dR_H[i], transa=True)
class BatchNormLayerImpl(Layer):

    expected_inputs = {'default': StructureTemplate('T', 'B', '...')}
    expected_kwargs = {'decay', 'epsilon'}

    def setup(self, kwargs, in_shapes):
        self.epsilon = kwargs.get('epsilon', 1.0e-5)
        self.decay = kwargs.get('decay', 0.9)
        assert 0.0 <= self.decay <= 1.0, "Decay must be between 0 and 1."

        outputs = OrderedDict()
        outputs['default'] = in_shapes['default']

        parameters = OrderedDict()
        buf = BufferStructure(self.in_shapes['default'].feature_shape[-1])
        parameters['gamma'] = buf
        parameters['beta'] = buf
        parameters['mu'] = buf
        parameters['sigma'] = buf

        internals = OrderedDict()
        internals['sigma_b'] = buf
        internals['centered'] = self.in_shapes['default']
        internals['x_hat'] = self.in_shapes['default']
        return outputs, parameters, internals

    def forward_pass(self, buffers, training_pass=True):
        _h = self.handler
        sigma_b, centered, x_hat = buffers.internals
        gamma, beta, mu, sigma = buffers.parameters

        # Note: we flatten time for all buffers, so we skip the flat_ prefix
        inputs = flatten_all_but_last(buffers.inputs.default)
        centered = flatten_all_but_last(centered)
        x_hat = flatten_all_but_last(x_hat)
        out = flatten_all_but_last(buffers.outputs.default)
        m = inputs.shape[0]

        if training_pass:
            mu_b = sigma_b  # temporarily reuse this buffer under another name
            # Calculate the (negative) batch mean
            _h.sum_t(inputs, 0, mu_b)
            _h.mult_st(-1.0 / m, mu_b, mu_b)

            # Adjust mu as an exponential moving average
            # TODO: Find better way
            _h.mult_st(self.decay, mu, mu)
            _h.mult_add_st(1.0 - self.decay, mu_b, mu)
            mu = mu_b

        # Calculate the centered activations
        _h.add_mv(inputs, mu.reshape((1, mu.size)), centered)

        if training_pass:
            sigma2 = sigma_b   # temporarily reuse under another name
            centered2 = x_hat  # temporarily reuse under another name
            # Calculate the variance
            _h.mult_tt(centered, centered, centered2)
            _h.sum_t(centered2, 0, sigma2)
            _h.mult_st(1.0 / m, sigma2, sigma2)  # TODO m-1 instead?
            _h.add_st(self.epsilon, sigma2, sigma2)  # numerically stabilized
            # Standard deviation
            _h.sqrt_t(sigma2, sigma_b)

            # Adjust sigma as an exponential moving sigma
            # FIXME: This is clearly a hack and wrong
            _h.mult_st(self.decay, sigma, sigma)
            _h.mult_add_st(1.0 - self.decay, sigma_b, sigma)
            sigma = sigma_b

        # compute normalized inputs
        _h.divide_mv(centered, sigma.reshape((1, sigma.size)), x_hat)

        # Compute outputs
        _h.mult_mv(x_hat, gamma.reshape((1, gamma.size)), out)
        _h.add_mv(out, beta.reshape((1, beta.size)), out)

    def backward_pass(self, buffers):
        _h = self.handler
        sigma_b, centered, x_hat = buffers.internals
        gamma = buffers.parameters.gamma
        dgamma = buffers.gradients.gamma
        dbeta = buffers.gradients.beta

        # Note: we flatten time for all buffers, so we skip the flat_ prefix
        x_hat = flatten_all_but_last(x_hat)
        outdeltas = flatten_all_but_last(buffers.output_deltas.default)
        indeltas = flatten_all_but_last(buffers.input_deltas.default)
        m = outdeltas.shape[0]

        big_tmp = _h.allocate(x_hat.shape)    # big
        small_tmp = _h.allocate(gamma.shape)  # small

        # ------------- Gradients ---------------
        # Calculate dgamma
        tmp = big_tmp
        dgamma_tmp = small_tmp
        _h.mult_tt(outdeltas, x_hat, tmp)
        _h.sum_t(tmp, axis=0, out=dgamma_tmp)
        _h.add_tt(dgamma_tmp, dgamma, dgamma)

        _h.mult_st(1 / m, dgamma_tmp, dgamma_tmp)
        term1 = big_tmp
        _h.mult_mv(x_hat, dgamma_tmp.reshape((1, gamma.size)), term1)

        # Calculate dbeta
        dbeta_tmp = small_tmp
        _h.sum_t(outdeltas, axis=0, out=dbeta_tmp)
        _h.add_tt(dbeta_tmp, dbeta, dbeta)
        _h.mult_st(1 / m, dbeta_tmp, dbeta_tmp)

        # ------------- Deltas ---------------
        term2 = big_tmp
        term3 = big_tmp
        _h.subtract_tt(outdeltas, term1, term2)
        _h.subtract_mv(term2, dbeta_tmp.reshape((1, dbeta.size)), term3)

        # get normalization factor (gamma / sigma_b)
        coeff = small_tmp
        _h.divide_tt(gamma, sigma_b, coeff)
        term4 = big_tmp
        _h.mult_mv(term3, coeff.reshape((1, coeff.size)), term4)
        _h.add_tt(term4, indeltas, indeltas)
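# Training-pass normalization in NumPy (sketch of the math above, ignoring
# the running-average bookkeeping): per-feature mean and variance over the
# flattened batch axis, epsilon added before the square root, then scale by
# gamma and shift by beta.
import numpy as np

def _batchnorm_forward(x, gamma, beta, epsilon=1e-5):
    mu = x.mean(axis=0)
    var = x.var(axis=0)                     # divides by m, as in the TODO
    x_hat = (x - mu) / np.sqrt(var + epsilon)
    return gamma * x_hat + beta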
class RecurrentLayerImpl(Layer):

    expected_inputs = {'default': StructureTemplate('T', 'B', '...')}
    expected_kwargs = {'size', 'activation'}

    def setup(self, kwargs, in_shapes):
        self.activation = kwargs.get('activation', 'tanh')
        self.size = kwargs.get('size',
                               self.in_shapes['default'].feature_size)
        if not isinstance(self.size, int):
            raise LayerValidationError('size must be int but was {}'.format(
                self.size))

        in_size = self.in_shapes['default'].feature_size

        outputs = OrderedDict()
        outputs['default'] = BufferStructure('T', 'B', self.size,
                                             context_size=1)

        parameters = OrderedDict()
        parameters['W'] = BufferStructure(self.size, in_size)
        parameters['R'] = BufferStructure(self.size, self.size)
        parameters['bias'] = BufferStructure(self.size)

        internals = OrderedDict()
        internals['Ha'] = BufferStructure('T', 'B', self.size,
                                          context_size=1)
        internals['dHa'] = BufferStructure('T', 'B', self.size,
                                           context_size=1,
                                           is_backward_only=True)
        internals['dHb'] = BufferStructure('T', 'B', self.size,
                                           context_size=1,
                                           is_backward_only=True)
        return outputs, parameters, internals

    def forward_pass(self, buffers, training_pass=True):
        # prepare
        _h = self.handler
        W, R, bias = buffers.parameters
        inputs = buffers.inputs.default
        outputs = buffers.outputs.default
        Ha = buffers.internals.Ha

        flat_inputs = flatten_time_and_features(inputs)
        flat_H = flatten_time(Ha[:-1])
        _h.dot_mm(flat_inputs, W, flat_H, transb=True)
        _h.add_mv(flat_H, bias.reshape((1, self.size)), flat_H)

        for t in range(inputs.shape[0]):
            _h.dot_add_mm(outputs[t - 1], R, Ha[t], transb=True)
            _h.act_func[self.activation](Ha[t], outputs[t])

    def backward_pass(self, buffers):
        # prepare
        _h = self.handler
        W, R, bias = buffers.parameters
        dW, dR, dbias = buffers.gradients
        inputs = buffers.inputs.default
        outputs = buffers.outputs.default
        dinputs = buffers.input_deltas.default
        doutputs = buffers.output_deltas.default
        Ha, dHa, dHb = buffers.internals

        _h.copy_to(doutputs, dHb)
        T = inputs.shape[0] - 1
        _h.act_func_deriv[self.activation](Ha[T], outputs[T], dHb[T], dHa[T])
        for t in range(T - 1, -1, -1):
            _h.dot_add_mm(dHa[t + 1], R, dHb[t])
            _h.act_func_deriv[self.activation](Ha[t], outputs[t],
                                               dHb[t], dHa[t])

        flat_inputs = flatten_time_and_features(inputs)
        flat_dinputs = flatten_time_and_features(dinputs)
        flat_dHa = flatten_time(dHa[:-1])

        # calculate in_deltas and gradients
        _h.dot_add_mm(flat_dHa, W, flat_dinputs)
        _h.dot_add_mm(flat_dHa, flat_inputs, dW, transa=True)

        dbias_tmp = _h.allocate(dbias.shape)
        _h.sum_t(flat_dHa, axis=0, out=dbias_tmp)
        _h.add_tt(dbias, dbias_tmp, dbias)

        flat_outputs = flatten_time(outputs[:-2])
        flat_dHa = flatten_time(dHa[1:-1])
        _h.dot_add_mm(flat_dHa, flat_outputs, dR, transa=True)
        _h.dot_add_mm(dHa[0], outputs[-1], dR, transa=True)
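# The recurrence implemented above, in NumPy for reference (sketch; tanh
# assumed, matching the 'activation' default): h_t = tanh(W x_t + R h_{t-1}
# + b), with h_{-1} = 0 standing in for the zero context frame.
import numpy as np

def _rnn_forward(x_tbf, W, R, bias):
    T, B, _ = x_tbf.shape
    y = np.zeros((T, B, W.shape[0]))
    h = np.zeros((B, W.shape[0]))
    for t in range(T):
        h = np.tanh(x_tbf[t] @ W.T + h @ R.T + bias)
        y[t] = h
    return y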
class SoftmaxCELayerImpl(Layer):

    expected_inputs = {
        'default': StructureTemplate('T', 'B', '...'),
        'targets': StructureTemplate('T', 'B', '...')
    }

    computes_no_input_deltas_for = ['targets']
    takes_no_output_deltas_from = ['probabilities']

    def setup(self, kwargs, in_shapes):
        in_shape = in_shapes['default'].feature_shape
        tar_shape = in_shapes['targets'].feature_shape

        if len(tar_shape) != len(in_shape):
            raise LayerValidationError('Default input and targets must have '
                                       'the same number of dimensions.')
        if tar_shape[:-1] != in_shape[:-1]:
            raise LayerValidationError('All dimensions except last must '
                                       'match for default input and '
                                       'targets.')
        if tar_shape[-1] != 1:
            raise LayerValidationError('Last dimension of targets must be '
                                       'of size 1.')

        outputs = OrderedDict()
        outputs['probabilities'] = BufferStructure('T', 'B', *in_shape)
        outputs['loss'] = BufferStructure('T', 'B', *tar_shape)

        internals = OrderedDict()
        internals['t_bin'] = BufferStructure('T', 'B', *in_shape,
                                             is_backward_only=True)
        return outputs, OrderedDict(), internals

    def forward_pass(self, buffers, training_pass=True):
        # prepare
        _h = self.handler
        inputs = buffers.inputs.default
        targets = buffers.inputs.targets
        probabilities = buffers.outputs.probabilities
        loss = buffers.outputs.loss

        # reshape
        flat_inputs = flatten_all_but_last(inputs)
        flat_probs = flatten_all_but_last(probabilities)
        flat_loss = flatten_all_but_last(loss)
        flat_targets = flatten_all_but_last(targets)

        # softmax
        _h.softmax_m(flat_inputs, flat_probs)

        # the multinomial cross entropy error is given by
        # - sum over i: p_i * ln(y_i)
        # now our targets are indices so all p_i = 0 except for i=t
        _h.fill(loss, 0.)
        _h.index_m_by_v(flat_probs, flat_targets, flat_loss)
        _h.clip_t(flat_loss, 1e-6, 1.0, flat_loss)
        _h.log_t(loss, loss)
        _h.mult_st(-1, loss, loss)

    def backward_pass(self, buffers):
        # prepare
        _h = self.handler
        targets = buffers.inputs.targets
        probs = buffers.outputs.probabilities
        dinputs = buffers.input_deltas.default
        dloss = buffers.output_deltas.loss
        t_bin = buffers.internals.t_bin

        # reshape
        flat_probs = flatten_all_but_last(probs)
        flat_targets = flatten_all_but_last(targets)
        flat_t_bin = flatten_all_but_last(t_bin)
        flat_dloss = flatten_all_but_last(dloss)
        flat_dinputs = flatten_all_but_last(dinputs)

        # derivative of multinomial cross-entropy error wrt softmax:
        # y - t
        _h.binarize_v(flat_targets, flat_t_bin)
        _h.mult_st(-1, flat_t_bin, flat_t_bin)
        _h.add_tt(flat_t_bin, flat_probs, flat_t_bin)
        _h.mult_mv(flat_t_bin, flat_dloss, flat_t_bin)
        _h.add_tt(flat_t_bin, flat_dinputs, flat_dinputs)
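# Softmax cross-entropy in NumPy (sketch): with index targets, the loss is
# -ln(probs[target]) per row and the gradient w.r.t. the pre-softmax inputs
# is probs - onehot(targets), which is what backward_pass assembles via
# binarize_v. Names are illustrative.
import numpy as np

def _softmax_ce(logits, target_idx):
    e = np.exp(logits - logits.max(axis=1, keepdims=True))
    probs = e / e.sum(axis=1, keepdims=True)
    rows = np.arange(len(target_idx))
    loss = -np.log(probs[rows, target_idx])
    grad = probs.copy()
    grad[rows, target_idx] -= 1.0           # y - t
    return loss, grad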
def test_structure_template_matches3(shape, expected):
    st = StructureTemplate('T', 'B', '...')
    struct = BufferStructure(*shape)
    assert st.matches(struct) == expected
class SoftmaxFiddleLayerImpl(Layer):

    expected_inputs = {'default': StructureTemplate('T', 'B', '...'),
                       'targets': StructureTemplate('T', 'B', '...')}

    computes_no_input_deltas_for = ['targets']
    takes_no_output_deltas_from = ['predictions']

    def setup(self, kwargs, in_shapes):
        in_shape = in_shapes['default'].feature_shape
        tar_shape = in_shapes['targets'].feature_shape

        if len(tar_shape) != len(in_shape):
            raise LayerValidationError('Default input and targets must have '
                                       'the same number of dimensions.')
        if tar_shape != in_shape:
            raise LayerValidationError('All dimensions must match '
                                       'for default input and targets.')

        outputs = OrderedDict()
        outputs['predictions'] = BufferStructure('T', 'B', *in_shape)
        outputs['loss'] = BufferStructure('T', 'B', *in_shape)

        internals = OrderedDict()
        internals['dcee'] = BufferStructure('T', 'B', *in_shape,
                                            is_backward_only=True)
        return outputs, OrderedDict(), internals

    def forward_pass(self, buffers, training_pass=True):
        # prepare
        _h = self.handler
        inputs = buffers.inputs.default
        targets = buffers.inputs.targets
        predictions = buffers.outputs.predictions
        loss = buffers.outputs.loss

        # reshape
        flat_inputs = flatten_all_but_last(inputs)
        flat_probs = flatten_all_but_last(predictions)
        flat_loss = flatten_all_but_last(loss)
        flat_targets = flatten_all_but_last(targets)

        # softmax
        _h.softmax_m(flat_inputs, flat_probs)

        # the multinomial cross entropy error is given by
        # - sum over i: p_i * ln(y_i)
        _h.copy_to(flat_probs, flat_loss)
        _h.clip_t(flat_loss, 1e-6, 1.0, flat_loss)
        _h.log_t(flat_loss, flat_loss)
        _h.mult_tt(flat_loss, flat_targets, flat_loss)
        _h.mult_st(-1, loss, loss)

    def backward_pass(self, buffers):
        # prepare
        _h = self.handler
        targets = flatten_time_and_features(buffers.inputs.targets)
        probs = flatten_time_and_features(buffers.outputs.predictions)
        dinputs = flatten_time_and_features(buffers.input_deltas.default)
        dloss = flatten_time_and_features(buffers.output_deltas.loss)
        dcee = flatten_time_and_features(buffers.internals.dcee)

        # derivative of multinomial cross-entropy error wrt softmax:
        # y - t
        _h.subtract_tt(probs, targets, dcee)  # y - t
        _h.mult_mv(dcee, dloss, dcee)         # out_delta * (y - t)
        _h.add_tt(dcee, dinputs, dinputs)
def test_structure_template_matches1(shape, expected):
    st = StructureTemplate('T', 'B', 1, 3)
    assert st.matches(BufferStructure(*shape)) == expected
class SquaredDifferenceLayerImpl(Layer):

    expected_inputs = {
        'inputs_1': StructureTemplate('T', 'B', '...'),
        'inputs_2': StructureTemplate('T', 'B', '...')
    }
    expected_kwargs = {}

    def setup(self, kwargs, in_shapes):
        # 'inputs_1' and 'inputs_2' must have the same feature size
        f_size1 = in_shapes['inputs_1'].feature_size
        f_size2 = in_shapes['inputs_2'].feature_size
        if f_size1 != f_size2:
            raise LayerValidationError(
                "{}: inputs_1 and inputs_2 must have same feature sizes but "
                "got {} and {}".format(
                    self.name,
                    in_shapes['inputs_1'].feature_shape,
                    in_shapes['inputs_2'].feature_shape))

        outputs = OrderedDict()
        outputs['default'] = BufferStructure('T', 'B', 1)

        internals = OrderedDict()
        feature_size = self.in_shapes['inputs_1'].feature_size
        internals['squared_diff'] = BufferStructure('T', 'B', feature_size)
        internals['grad_diff'] = BufferStructure('T', 'B', feature_size,
                                                 is_backward_only=True)
        return outputs, OrderedDict(), internals

    def forward_pass(self, buffers, training_pass=True):
        # prepare
        _h = self.handler
        inputs_1 = flatten_time_and_features(buffers.inputs.inputs_1)
        inputs_2 = flatten_time_and_features(buffers.inputs.inputs_2)
        diff = flatten_time_and_features(buffers.internals.squared_diff)
        diff_sum = flatten_time(buffers.outputs.default)

        # calculate
        _h.subtract_tt(inputs_1, inputs_2, out=diff)
        _h.mult_tt(diff, diff, out=diff)
        _h.sum_t(diff, axis=1, out=diff_sum)
        _h.mult_st(0.5, diff_sum, out=diff_sum)

    def backward_pass(self, buffers):
        # prepare
        _h = self.handler
        inputs_1 = flatten_time_and_features(buffers.inputs.inputs_1)
        inputs_2 = flatten_time_and_features(buffers.inputs.inputs_2)
        out_deltas = buffers.output_deltas.default
        grad_diff = buffers.internals.grad_diff
        dinputs_1 = flatten_time_and_features(buffers.input_deltas.inputs_1)
        dinputs_2 = flatten_time_and_features(buffers.input_deltas.inputs_2)
        tmp = _h.allocate(inputs_2.shape)

        # out_deltas has only one feature dimension due to summation,
        # so we broadcast to all feature dimensions
        _h.broadcast_t(out_deltas, 2, grad_diff)
        grad_diff = flatten_time(grad_diff)

        # calculate
        _h.subtract_tt(inputs_1, inputs_2, out=tmp)
        _h.mult_add_tt(grad_diff, tmp, dinputs_1)
        _h.subtract_tt(inputs_2, inputs_1, out=tmp)
        _h.mult_add_tt(grad_diff, tmp, dinputs_2)
class ClockworkLstmLayerImpl(Layer): expected_kwargs = {'size', 'activation'} expected_inputs = {'default': StructureTemplate('T', 'B', '...')} computes_no_gradients_for = ['timing'] def setup(self, kwargs, in_shapes): self.activation = kwargs.get('activation', 'tanh') self.size = kwargs.get('size', in_shapes['default'].feature_size) if not isinstance(self.size, int): raise LayerValidationError('size must be int but was {}'. format(self.size)) in_size = in_shapes['default'].feature_size outputs = OrderedDict() outputs['default'] = BufferStructure('T', 'B', self.size, context_size=1) parameters = OrderedDict() parameters['Wz'] = BufferStructure(self.size, in_size) parameters['Wi'] = BufferStructure(self.size, in_size) parameters['Wf'] = BufferStructure(self.size, in_size) parameters['Wo'] = BufferStructure(self.size, in_size) parameters['pi'] = BufferStructure(1, self.size) parameters['pf'] = BufferStructure(1, self.size) parameters['po'] = BufferStructure(1, self.size) parameters['Rz'] = BufferStructure(self.size, self.size) parameters['Ri'] = BufferStructure(self.size, self.size) parameters['Rf'] = BufferStructure(self.size, self.size) parameters['Ro'] = BufferStructure(self.size, self.size) parameters['bz'] = BufferStructure(self.size) parameters['bi'] = BufferStructure(self.size) parameters['bf'] = BufferStructure(self.size) parameters['bo'] = BufferStructure(self.size) parameters['timing'] = BufferStructure(self.size) internals = OrderedDict() internals['Za'] = BufferStructure('T', 'B', self.size, context_size=1) internals['Zb'] = BufferStructure('T', 'B', self.size, context_size=1) internals['Ia'] = BufferStructure('T', 'B', self.size, context_size=1) internals['Ib'] = BufferStructure('T', 'B', self.size, context_size=1) internals['Fa'] = BufferStructure('T', 'B', self.size, context_size=1) internals['Fb'] = BufferStructure('T', 'B', self.size, context_size=1) internals['Oa'] = BufferStructure('T', 'B', self.size, context_size=1) internals['Ob'] = BufferStructure('T', 'B', self.size, context_size=1) internals['Ca'] = BufferStructure('T', 'B', self.size, context_size=1) internals['Cb'] = BufferStructure('T', 'B', self.size, context_size=1) internals['dZa'] = BufferStructure('T', 'B', self.size, context_size=1, is_backward_only=True) internals['dZb'] = BufferStructure('T', 'B', self.size, context_size=1, is_backward_only=True) internals['dIa'] = BufferStructure('T', 'B', self.size, context_size=1, is_backward_only=True) internals['dIb'] = BufferStructure('T', 'B', self.size, context_size=1, is_backward_only=True) internals['dFa'] = BufferStructure('T', 'B', self.size, context_size=1, is_backward_only=True) internals['dFb'] = BufferStructure('T', 'B', self.size, context_size=1, is_backward_only=True) internals['dOa'] = BufferStructure('T', 'B', self.size, context_size=1, is_backward_only=True) internals['dOb'] = BufferStructure('T', 'B', self.size, context_size=1, is_backward_only=True) internals['dCa'] = BufferStructure('T', 'B', self.size, context_size=1, is_backward_only=True) internals['dCb'] = BufferStructure('T', 'B', self.size, context_size=1, is_backward_only=True) return outputs, parameters, internals def forward_pass(self, buffers, training_pass=True): # prepare _h = self.handler (Wz, Wi, Wf, Wo, pi, pf, po, Rz, Ri, Rf, Ro, bz, bi, bf, bo, timing) = buffers.parameters (Za, Zb, Ia, Ib, Fa, Fb, Oa, Ob, Ca, Cb, dZa, dZb, dIa, dIb, dFa, dFb, dOa, dOb, dCa, dCb) = buffers.internals x = buffers.inputs.default y = buffers.outputs.default time_size, batch_size = x.shape[0], x.shape[1] 
        # Temporary variables: tmp is filled with the current value of the
        # time step t, cond flags the units that are inactive at step t
        tmp = _h.zeros(timing.shape)
        cond = _h.zeros(y[0].shape)

        flat_x = flatten_time_and_features(x)
        flat_Za = flatten_time(Za[:-1])
        flat_Ia = flatten_time(Ia[:-1])
        flat_Fa = flatten_time(Fa[:-1])
        flat_Oa = flatten_time(Oa[:-1])
        _h.dot_mm(flat_x, Wz, flat_Za, transb=True)
        _h.dot_mm(flat_x, Wi, flat_Ia, transb=True)
        _h.dot_mm(flat_x, Wf, flat_Fa, transb=True)
        _h.dot_mm(flat_x, Wo, flat_Oa, transb=True)

        for t in range(time_size):
            # Block input
            _h.dot_add_mm(y[t - 1], Rz, Za[t], transb=True)
            _h.add_mv(Za[t], bz.reshape((1, self.size)), Za[t])
            _h.act_func[self.activation](Za[t], Zb[t])

            # Input Gate
            _h.dot_add_mm(y[t - 1], Ri, Ia[t], transb=True)
            _h.mult_add_mv(Ca[t - 1], pi, Ia[t])  # peephole connection
            _h.add_mv(Ia[t], bi.reshape((1, self.size)), Ia[t])
            _h.sigmoid(Ia[t], Ib[t])

            # Forget Gate
            _h.dot_add_mm(y[t - 1], Rf, Fa[t], transb=True)
            _h.mult_add_mv(Ca[t - 1], pf, Fa[t])  # peephole connection
            _h.add_mv(Fa[t], bf.reshape((1, self.size)), Fa[t])
            _h.sigmoid(Fa[t], Fb[t])

            # Cell
            _h.mult_tt(Ib[t], Zb[t], Ca[t])
            _h.mult_add_tt(Fb[t], Ca[t - 1], Ca[t])

            # Output Gate
            _h.dot_add_mm(y[t - 1], Ro, Oa[t], transb=True)
            _h.mult_add_mv(Ca[t], po, Oa[t])  # peephole connection
            _h.add_mv(Oa[t], bo.reshape((1, self.size)), Oa[t])
            _h.sigmoid(Oa[t], Ob[t])

            # Block output
            _h.act_func[self.activation](Ca[t], Cb[t])
            _h.mult_tt(Ob[t], Cb[t], y[t])

            if t > 0:
                _h.fill(tmp, t)
                _h.modulo_tt(tmp, timing, tmp)
                _h.broadcast_t(tmp.reshape((1, tmp.shape[0])), 0, cond)

                # Reset cell of inactive units
                _h.copy_to_if(Ca[t - 1], Ca[t], cond)
                # Reset block output of inactive units
                _h.copy_to_if(y[t - 1], y[t], cond)

    def backward_pass(self, buffers):
        # prepare
        _h = self.handler
        (dWz, dWi, dWf, dWo,
         dpi, dpf, dpo,
         dRz, dRi, dRf, dRo,
         dbz, dbi, dbf, dbo,
         dtiming) = buffers.gradients
        (Wz, Wi, Wf, Wo,
         pi, pf, po,
         Rz, Ri, Rf, Ro,
         bz, bi, bf, bo,
         timing) = buffers.parameters
        (Za, Zb, Ia, Ib, Fa, Fb, Oa, Ob, Ca, Cb,
         dZa, dZb, dIa, dIb, dFa, dFb, dOa, dOb,
         dCa, dCb) = buffers.internals
        x = buffers.inputs.default
        dx = buffers.input_deltas.default
        y = buffers.outputs.default
        deltas = buffers.output_deltas.default

        # buffer for the accumulated output deltas
        dy = _h.allocate(y.shape)

        time_size, batch_size = x.shape[0], x.shape[1]

        # Temporary variable to be filled with the current value of time t
        tmp = _h.zeros(timing.shape)
        _h.fill(dCa, 0.0)
        cond = _h.zeros(y[0].shape)

        for t in range(time_size - 1, -1, -1):
            # Accumulate recurrent deltas
            _h.add_tt(dy[t], deltas[t], dy[t])

            _h.fill(tmp, t)
            _h.modulo_tt(tmp, timing, tmp)
            _h.broadcast_t(tmp.reshape((1, tmp.shape[0])), 0, cond)

            # deltas coming back from step t + 1 through the recurrent
            # and peephole connections
            _h.dot_add_mm(dIa[t + 1], Ri, dy[t])
            _h.dot_add_mm(dFa[t + 1], Rf, dy[t])
            _h.dot_add_mm(dOa[t + 1], Ro, dy[t])
            _h.dot_add_mm(dZa[t + 1], Rz, dy[t])
            _h.mult_add_mv(dIa[t + 1], pi, dCa[t])
            _h.mult_add_mv(dFa[t + 1], pf, dCa[t])

            # Output Gate
            _h.mult_tt(dy[t], Cb[t], dOb[t])
            _h.fill_if(dOb[t], 0, cond)  # set inactive units to 0
            _h.sigmoid_deriv(Oa[t], Ob[t], dOb[t], dOa[t])

            # Output influence on peephole:
            _h.mult_add_mv(dOa[t], po, dCa[t])

            # Cell
            _h.mult_tt(dy[t], Ob[t], dCb[t])
            _h.act_func_deriv[self.activation](Ca[t], Cb[t], dCb[t], dCb[t])
            _h.fill_if(dCb[t], 0, cond)

            _h.add_tt(dCa[t], dCb[t], dCa[t])
            _h.mult_add_tt(dCa[t + 1], Fb[t + 1], dCa[t])

            # Forget Gate
            _h.mult_tt(dCa[t], Ca[t - 1], dFb[t])
            _h.sigmoid_deriv(Fa[t], Fb[t], dFb[t], dFa[t])

            # Input Gate
            _h.mult_tt(dCa[t], Zb[t], dIb[t])
            _h.sigmoid_deriv(Ia[t], Ib[t], dIb[t], dIa[t])

            # Block Input
            _h.mult_tt(dCa[t], Ib[t], dZb[t])
            _h.act_func_deriv[self.activation](Za[t], Zb[t], dZb[t], dZa[t])

            # For inactive nodes, pass the error on to the previous time step
            _h.add_into_if(dy[t], dy[t - 1], cond)
            _h.add_into_if(dCa[t], dCa[t - 1], cond)

            # Undo updates to inactive nodes:
            _h.fill_if(dIa[t], 0, cond)
            _h.fill_if(dFa[t], 0, cond)
            _h.fill_if(dZa[t], 0, cond)
            # zero the forget gate of inactive units so that no cell delta
            # flows through it at step t - 1 (the delta was copied directly
            # into dCa[t - 1] above)
            _h.fill_if(Fb[t], 0, cond)

        # Same as for standard RNN:
        flat_inputs = flatten_time_and_features(x)
        flat_dinputs = flatten_time_and_features(dx)

        flat_dIa = flatten_time(dIa[:-1])
        flat_dFa = flatten_time(dFa[:-1])
        flat_dOa = flatten_time(dOa[:-1])
        flat_dZa = flatten_time(dZa[:-1])

        # calculate in_deltas and gradients
        _h.dot_add_mm(flat_dIa, Wi, flat_dinputs)
        _h.dot_add_mm(flat_dFa, Wf, flat_dinputs)
        _h.dot_add_mm(flat_dOa, Wo, flat_dinputs)
        _h.dot_add_mm(flat_dZa, Wz, flat_dinputs)

        _h.dot_add_mm(flat_dIa, flat_inputs, dWi, transa=True)
        _h.dot_add_mm(flat_dFa, flat_inputs, dWf, transa=True)
        _h.dot_add_mm(flat_dOa, flat_inputs, dWo, transa=True)
        _h.dot_add_mm(flat_dZa, flat_inputs, dWz, transa=True)

        dbias_tmp = _h.allocate(dbz.shape)
        _h.sum_t(flat_dIa, axis=0, out=dbias_tmp)
        _h.add_tt(dbi, dbias_tmp, dbi)
        _h.sum_t(flat_dFa, axis=0, out=dbias_tmp)
        _h.add_tt(dbf, dbias_tmp, dbf)
        _h.sum_t(flat_dOa, axis=0, out=dbias_tmp)
        _h.add_tt(dbo, dbias_tmp, dbo)
        _h.sum_t(flat_dZa, axis=0, out=dbias_tmp)
        _h.add_tt(dbz, dbias_tmp, dbz)

        flat_outputs = flatten_time(y[:-2])
        flat_cell = flatten_time(Ca[:-2])
        flat_cell2 = flatten_time(Ca[:-1])

        dWco_tmp = _h.allocate(flat_cell2.shape)
        dWc_tmp = _h.allocate(dpo.shape)

        # Peephole connection output weight:
        _h.mult_tt(flat_cell2, flat_dOa, dWco_tmp)
        _h.sum_t(dWco_tmp, axis=0, out=dWc_tmp)
        _h.add_tt(dpo, dWc_tmp, dpo)

        flat_dIa = flatten_time(dIa[1:-1])
        flat_dFa = flatten_time(dFa[1:-1])
        flat_dOa = flatten_time(dOa[1:-1])
        flat_dZa = flatten_time(dZa[1:-1])

        _h.dot_add_mm(flat_dIa, flat_outputs, dRi, transa=True)
        _h.dot_add_mm(flat_dFa, flat_outputs, dRf, transa=True)
        _h.dot_add_mm(flat_dOa, flat_outputs, dRo, transa=True)
        _h.dot_add_mm(flat_dZa, flat_outputs, dRz, transa=True)

        # boundary terms at t = 0: the previous output is the context slot
        # y[-1], not the delta buffer dy[-1]
        _h.dot_add_mm(dIa[0], y[-1], dRi, transa=True)
        _h.dot_add_mm(dFa[0], y[-1], dRf, transa=True)
        _h.dot_add_mm(dOa[0], y[-1], dRo, transa=True)
        _h.dot_add_mm(dZa[0], y[-1], dRz, transa=True)

        # Other Peephole connections
        dWcif_tmp = _h.allocate(flat_cell.shape)
        _h.mult_tt(flat_cell, flat_dIa, dWcif_tmp)
        _h.sum_t(dWcif_tmp, axis=0, out=dWc_tmp)
        _h.add_tt(dpi, dWc_tmp, dpi)
        _h.mult_tt(flat_cell, flat_dFa, dWcif_tmp)
        _h.sum_t(dWcif_tmp, axis=0, out=dWc_tmp)
        _h.add_tt(dpf, dWc_tmp, dpf)

        # boundary terms at t = 0: the gate deltas are multiplied with the
        # cell state from the context slot Ca[-1]; dpi uses the input gate
        # deltas dIa[0] and dpf the forget gate deltas dFa[0]
        dWcif_tmp = _h.allocate(dIa[0].shape)
        _h.mult_tt(Ca[-1], dIa[0], dWcif_tmp)
        _h.sum_t(dWcif_tmp, axis=0, out=dWc_tmp)
        _h.add_tt(dpi, dWc_tmp, dpi)
        _h.mult_tt(Ca[-1], dFa[0], dWcif_tmp)
        _h.sum_t(dWcif_tmp, axis=0, out=dWc_tmp)
        _h.add_tt(dpf, dWc_tmp, dpf)
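# A minimal NumPy sketch (illustration only, not part of the layer code) of
# the clockwork reset used above: a unit with period p updates only at steps
# where t % p == 0; otherwise its previous cell state and output are copied
# through, which is what the copy_to_if calls implement. The helper name
# _clockwork_reset is hypothetical.
import numpy as np


def _clockwork_reset(y_prev, y_new, c_prev, c_new, t, timing):
    # nonzero remainder -> unit is inactive at step t and keeps its state
    inactive = (t % timing) != 0
    return (np.where(inactive, y_prev, y_new),
            np.where(inactive, c_prev, c_new))

# Example: with periods (1, 2, 4) only the first two units update at t = 2:
# _clockwork_reset(np.zeros(3), np.ones(3), np.zeros(3), np.ones(3), 2,
#                  np.array([1, 2, 4]))  ->  y == [1., 1., 0.]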
class SigmoidCELayerImpl(Layer):

    expected_inputs = {'default': StructureTemplate('T', 'B', '...'),
                       'targets': StructureTemplate('T', 'B', '...')}

    computes_no_input_deltas_for = ['targets']
    takes_no_output_deltas_from = ['predictions']

    def setup(self, kwargs, in_shapes):
        in_shape = in_shapes['default'].feature_shape
        tar_shape = in_shapes['targets'].feature_shape
        if tar_shape != in_shape:
            raise LayerValidationError('input and targets must have the same '
                                       'shapes. But got {} != {}'
                                       .format(in_shape, tar_shape))

        outputs = OrderedDict()
        outputs['predictions'] = BufferStructure('T', 'B', *in_shape)
        outputs['loss'] = BufferStructure('T', 'B', *in_shape)

        internals = OrderedDict()
        internals['dcee'] = BufferStructure('T', 'B', *in_shape,
                                            is_backward_only=True)
        return outputs, OrderedDict(), internals

    def forward_pass(self, buffers, training_pass=True):
        # prepare
        _h = self.handler
        assert isinstance(_h, Handler)
        inputs = flatten_time_and_features(buffers.inputs.default)
        targets = flatten_time_and_features(buffers.inputs.targets)
        loss = flatten_time_and_features(buffers.outputs.loss)
        prob = flatten_time_and_features(buffers.outputs.predictions)

        # apply sigmoid
        _h.sigmoid(inputs, prob)

        # the binomial cross-entropy error is given by
        # - (t * ln(y) + (1 - t) * ln(1 - y))
        tmp = _h.ones(prob.shape)
        _h.subtract_tt(tmp, prob, loss)    # loss = 1 - y
        _h.subtract_tt(tmp, targets, tmp)  # tmp = 1 - t
        _h.clip_t(loss, 1e-6, 1.0, loss)
        _h.log_t(loss, loss)               # loss = ln(1 - y)
        _h.mult_tt(tmp, loss, tmp)         # tmp = (1 - t) * ln(1 - y)

        _h.clip_t(prob, 1e-6, 1.0, loss)
        _h.log_t(loss, loss)               # loss = ln(y)
        _h.mult_tt(targets, loss, loss)    # loss = t * ln(y)

        _h.add_tt(tmp, loss, loss)  # loss = t * ln(y) + (1 - t) * ln(1 - y)
        _h.mult_st(-1, loss, loss)  # negate to get the cross entropy

    def backward_pass(self, buffers):
        # prepare
        _h = self.handler
        assert isinstance(_h, Handler)
        dinputs = flatten_time_and_features(buffers.input_deltas.default)
        dloss = flatten_time_and_features(buffers.output_deltas.loss)
        dcee = flatten_time_and_features(buffers.internals.dcee)
        targets = flatten_time_and_features(buffers.inputs.targets)
        prob = flatten_time_and_features(buffers.outputs.predictions)

        # calculate
        _h.subtract_tt(prob, targets, dcee)  # dcee = y - t
        # dloss has the same shape as dcee, so this is an element-wise
        # product rather than a matrix-vector broadcast
        _h.mult_tt(dcee, dloss, dcee)        # dcee = out_delta * (y - t)
        _h.add_tt(dcee, dinputs, dinputs)
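# A minimal NumPy sketch (illustration only) of the binomial cross entropy
# computed by SigmoidCELayerImpl, with the same 1e-6 clipping. The gradient
# of the loss w.r.t. the pre-sigmoid input simplifies to y - t, which is
# exactly what the backward pass accumulates. The helper name _sigmoid_ce
# is hypothetical.
import numpy as np


def _sigmoid_ce(x, t, eps=1e-6):
    y = 1.0 / (1.0 + np.exp(-x))  # sigmoid predictions
    loss = -(t * np.log(np.clip(y, eps, 1.0)) +
             (1.0 - t) * np.log(np.clip(1.0 - y, eps, 1.0)))
    dx = y - t  # delta w.r.t. the pre-sigmoid input (before out_delta)
    return y, loss, dx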
class Convolution2DLayerImpl(Layer):

    expected_inputs = {'default': StructureTemplate('T', 'B', '...')}
    expected_kwargs = {'num_filters', 'kernel_size', 'stride', 'padding',
                       'activation'}

    def setup(self, kwargs, in_shapes):
        self.activation = kwargs.get('activation', 'tanh')
        assert 'num_filters' in kwargs, \
            "num_filters must be specified for ConvolutionLayer"
        assert 'kernel_size' in kwargs, \
            "kernel_size must be specified for ConvolutionLayer"
        self.num_filters = kwargs['num_filters']
        self.kernel_size = kwargs['kernel_size']
        self.stride = tuple(kwargs.get('stride', (1, 1)))
        self.padding = kwargs.get('padding', 0)
        assert type(self.padding) is int and self.padding >= 0, \
            "Invalid padding: {}".format(self.padding)
        assert type(self.kernel_size) in [list, tuple] and \
            len(self.kernel_size) == 2, \
            "Kernel size must be list or tuple of length 2: {}".format(
                self.kernel_size)
        assert type(self.stride) in [list, tuple] and len(self.stride) == 2, \
            "Stride must be list or tuple of length 2: {}".format(self.stride)
        # strides must be at least 1, otherwise the output size computation
        # below divides by zero
        assert self.stride[0] >= 1 and self.stride[1] >= 1, \
            "Invalid stride: {}".format(self.stride)

        in_shape = self.in_shapes['default'].feature_shape
        assert isinstance(in_shape, tuple) and len(in_shape) == 3, \
            "Convolution2DLayer must have 3 dimensional input but input " \
            "shape was {}".format(in_shape)

        num_input_maps = in_shape[2]
        num_filters = self.num_filters
        kernel_x, kernel_y = self.kernel_size
        padding, stride = self.padding, self.stride
        output_height = (
            (in_shape[0] + 2 * padding - kernel_x) // stride[0]) + 1
        output_width = (
            (in_shape[1] + 2 * padding - kernel_y) // stride[1]) + 1
        out_shape = (output_height, output_width, num_filters)

        outputs = OrderedDict()
        outputs['default'] = BufferStructure('T', 'B', *out_shape)

        parameters = OrderedDict()
        parameters['W'] = BufferStructure(num_filters, kernel_x, kernel_y,
                                          num_input_maps)
        parameters['bias'] = BufferStructure(num_filters)

        internals = OrderedDict()
        return outputs, parameters, internals

    def forward_pass(self, buffers, training_pass=True):
        # prepare
        _h = self.handler
        W, bias = buffers.parameters
        inputs = buffers.inputs.default
        outputs = buffers.outputs.default

        # reshape
        flat_inputs = flatten_time(inputs)
        flat_outputs = flatten_time(outputs)

        # calculate outputs
        _h.conv2d_forward_batch(flat_inputs, W, bias, flat_outputs,
                                self.padding, self.stride)
        _h.inplace_act_func[self.activation](outputs)

    def backward_pass(self, buffers):
        # prepare
        _h = self.handler
        W, bias = buffers.parameters
        dW, dbias = buffers.gradients
        inputs = buffers.inputs.default
        outputs = buffers.outputs.default
        in_deltas = buffers.input_deltas.default
        out_deltas = buffers.output_deltas.default

        # reshape
        flat_inputs = flatten_time(inputs)
        flat_in_deltas = flatten_time(in_deltas)
        flat_out_deltas = flatten_time(out_deltas)

        # calculate in_deltas and gradients
        _h.inplace_act_func_deriv[self.activation](outputs, out_deltas)
        _h.conv2d_backward_batch(flat_inputs, W, self.padding, self.stride,
                                 flat_in_deltas, flat_out_deltas, dW, dbias)
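# A quick check (illustration only) of the output-shape formula used in
# Convolution2DLayerImpl.setup():
#   out = (in + 2 * padding - kernel) // stride + 1
# The helper name _conv2d_output_shape is hypothetical.
def _conv2d_output_shape(in_shape, kernel_size, stride=(1, 1), padding=0):
    out_h = (in_shape[0] + 2 * padding - kernel_size[0]) // stride[0] + 1
    out_w = (in_shape[1] + 2 * padding - kernel_size[1]) // stride[1] + 1
    return out_h, out_w

# Example: a 32x32 input with a 5x5 kernel shrinks to 28x28 without padding
# and keeps its size with padding 2:
assert _conv2d_output_shape((32, 32), (5, 5)) == (28, 28)
assert _conv2d_output_shape((32, 32), (5, 5), padding=2) == (32, 32)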