def Pooling(op,            # PoolingType_Max or _Average
            filter_shape,  # e.g. (3,3)
            strides=1,
            pad=False):
    x = Placeholder(name='pooling_arg')
    apply_x = pooling(x, op, filter_shape, strides=_as_tuple(strides), auto_padding=_as_tuple(pad))
    if op == PoolingType_Average:
        op_name = 'AveragePooling'
    elif op == PoolingType_Max:
        op_name = 'MaxPooling'
    else:
        raise ValueError('Pooling: op must be PoolingType_Max or PoolingType_Average')
    return Block(apply_x, op_name)

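# Example (sketch, not part of the original module): applying the Pooling factory to an
# image-like input. Assumes the PoolingType_Max constant referenced above is in scope and
# that cntk.ops.input_variable is available; the helper name is illustrative only.
def _example_max_pooling():
    from cntk.ops import input_variable
    x = input_variable((3, 32, 32))                               # (channels, height, width)
    pool = Pooling(PoolingType_Max, (3, 3), strides=2, pad=True)  # a 'MaxPooling' Block
    return pool(x)                                                # stride 2 roughly halves each spatial dim
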
def Convolution(filter_shape,        # e.g. (3,3)
                num_filters=None,    # e.g. 64 or None (which means 1 channel and don't add a dimension)
                activation=activation_default_or_None,
                init=init_default_or_glorot_uniform,
                pad=pad_default_or_False,
                strides=1,
                sharing=True,        # (must be True currently)
                bias=bias_default_or_True,
                init_bias=init_bias_default_or_0,
                reduction_rank=1,    # (must be 1 currently)
                transpose=False,     # (must be False currently)
                max_temp_mem_size_in_samples=0):
    #UntestedBranchError("Convolution")
    activation = _resolve_activation(activation)
    pad  = pad  if _is_given(pad)  else _current_default_options.pad
    bias = bias if _is_given(bias) else _current_default_options.bias
    # TODO: there must be a Python trick to do this as a function call on locals or so
    if reduction_rank != 1:
        raise NotImplementedError("Convolution: reduction_rank other than 1 currently not supported")
    if transpose:
        raise NotImplementedError("Convolution: transpose option currently not supported")
    if not sharing:
        raise NotImplementedError("Convolution: sharing option currently must be True")
    output_channels_shape = _as_tuple(num_filters)
    output_rank = len(output_channels_shape)
    filter_rank = len(filter_shape)
    kernel_shape = _INFERRED * reduction_rank + filter_shape  # kernel := filter plus reductionDims

    # parameters bound to this Function
    #init_kernel = glorot_uniform(filter_rank=-filter_rank, output_rank=1)
    init_kernel = _initializer_for(init, Record(filter_rank=filter_rank, output_rank=-1))
    # BUGBUG: It is very confusing that output_rank is negative, esp. since that means count from the start. Solution: add a flag
    W = Parameter(output_channels_shape + kernel_shape,             init=init_kernel, name='W')                   # (K, C, H, W) aka [ W x H x C x K ]
    b = Parameter(output_channels_shape + (1,) * len(filter_shape), init=init_bias,   name='b') if bias else None # (K, 1, 1)    aka [ 1 x 1 x K ]

    # expression
    x = Placeholder(name='convolution_arg')
    # TODO: update the parameter order of convolution() to match the optional ones as in here? (options order matches Keras)
    apply_x = convolution(W, x,
                          strides=_as_tuple(strides),
                          sharing=_as_tuple(sharing),
                          auto_padding=_as_tuple(pad),  # TODO: can we rename auto_padding to pad?
                          transpose=transpose,
                          max_temp_mem_size_in_samples=max_temp_mem_size_in_samples)
    if bias:
        apply_x = apply_x + b
    apply_x = apply_x >> activation
    return Block(apply_x, 'Convolution', Record(W=W, b=b))

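# Example (sketch): a single convolutional layer built from the factory above. Assumes
# cntk.ops.input_variable and cntk.ops.relu; the helper name is illustrative only.
def _example_conv_layer():
    from cntk.ops import input_variable, relu
    x = input_variable((3, 32, 32))
    conv = Convolution((5, 5), num_filters=64, activation=relu, pad=True, strides=1)
    return conv(x)  # with pad=True and stride 1 the output keeps shape (64, 32, 32)
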
def Embedding(shape=None, init=None, weights=None):
    if init is not None and weights is not None:
        raise ValueError('Embedding: init and weights options are mutually exclusive')

    # parameters bound to this Function:
    # no weights given: learn the embedding
    if weights is None:
        if shape is None:
            raise ValueError('Embedding: output shape must be specified')
        if init is None:
            init = init_default_or_glorot_uniform
        shape = _as_tuple(shape)
        weight_shape = _INFERRED + shape
        E = Parameter(weight_shape, init=init, name='E')
    # weights given: use them as constant
    else:
        UntestedBranchError("Embedding, from constant")
        import numpy as np
        if not isinstance(weights, np.ndarray):
            UntestedBranchError("Embedding, from constant that is not an array")  # TODO: can 'weights' be a CNTK object? Then how to do this?
            raise ValueError('Embedding: weights must be a numpy array')
        weight_shape = np.shape(weights)
        if shape is not None:  # user may give shape, then it must match
            if len(shape) >= len(weight_shape) or weight_shape[-len(shape):] != shape:
                raise ValueError('Embedding: shape parameter must match weights')
        E = Constant(weights, name='E')

    # expression
    x = Placeholder(name='embedding_arg')
    apply_x = times(x, E)
    return Block(apply_x, 'Embedding', Record(E=E))

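# Example (sketch): learning a 300-dimensional embedding over a one-hot vocabulary of
# 10000 symbols. Assumes cntk.ops.input_variable with is_sparse support; names are
# illustrative only.
def _example_embedding():
    from cntk.ops import input_variable
    words = input_variable(10000, is_sparse=True)  # sparse one-hot input
    embed = Embedding(300)                         # E gets shape (10000, 300) once the input dim is inferred
    return embed(words)                            # a 300-dimensional dense vector per word
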
def Linear(shape, _inf, bias=True, init=_default_initializer, init_bias=0, input_rank=None, map_rank=None):
    out_shape = _as_tuple(shape)

    # TODO: implement the full semantics of the BrainScript code
    #inputShape =
    #    if BS.Constants.IsNone (inputRank) then Inferred  # not given: one Inferred, which will get expanded
    #    else if !BS.Constants.IsNone (mapRank) then Fail ("'inputRank' and 'mapRank' cannot be specified at the same time.")
    #    else Repeat (inputRank, Inferred)
    #W = ParameterTensor {_ConcatArrays (outDim, inputShape), init=init, initValueScale=initValueScale}
    #b = ParameterTensor {outDim, initValue=0}
    #outputRank = Length (_AsArray (outDim)) # support outputs with tensor layouts
    #inferInputRankToMap =
    #    if !BS.Constants.IsNone (inputRank) then -1  # means not specified
    #    else if BS.Constants.IsNone (mapRank) then 0  # default to 'use all input dims'
    #    else mapRank
    #apply (x) =
    #    if bias
    #    then Times (W, x, outputRank=outputRank, inferInputRankToMap=inferInputRankToMap) + b
    #    else Times (W, x, outputRank=outputRank, inferInputRankToMap=inferInputRankToMap)

    W = Parameter(_inf.shape + out_shape, init=init,      name='W')
    b = Parameter(             out_shape, init=init_bias, name='b') if bias else None
    x = Placeholder(_inf=_inf, name='linear_arg')
    apply_x = Function.__matmul__(x, W) + b if bias else \
              Function.__matmul__(x, W)
    _name_and_extend_Function(apply_x, 'Linear')
    return apply_x

def Recurrence(over, go_backwards=False, initial_state=initial_state_default_or_None):
    # helper to compute previous value
    # can take a single Variable/Function or a tuple
    initial_state = initial_state if _is_given(initial_state) else _current_default_options.initial_state
    # if initial state is given and a numeric constant, then turn it into a Constant() object
    if np.isscalar(initial_state):
        initial_state = Constant(initial_state, shape=(1,))  # TODO: This should be automatically done inside the API.
    def previous_hook(state):
        if isinstance(state, tuple):  # if multiple then apply to each element
            return tuple([previous_hook(s) for s in state])
        # not a tuple: must be a 'scalar', i.e. a single element
        return past_value(state, initial_state) if not go_backwards else \
               future_value(state, initial_state)
    x = Placeholder(name='recurrence_arg')
    state_forward = over.create_placeholder()  # create a placeholder or a tuple of placeholders
    prev_state = previous_hook(state_forward)  # delay (h, c)
    f_x_h_c = over(x, prev_state)              # apply the recurrent over
    # this returns a Function (x, (h_prev, c_prev)) -> (h, c)
    h_c = f_x_h_c.outputs
    replacements = { value_forward: value for (value_forward, value) in zip(list(_as_tuple(state_forward)), h_c) }
    f_x_h_c.replace_placeholders(replacements)  # resolves state_forward := h_c
    h = f_x_h_c.outputs[0]  # 'h' is a Variable (the output of a Function that computed it)
    if _trace_layers:
        _log_node(h)
        _log_node(combine([h.owner]))
    apply_x = combine([h])  # the Function that yielded 'h', so we get to know its inputs
    # apply_x is a Function x -> h
    return Block(apply_x, 'Recurrence', Record(over=over))

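# Example (sketch): unrolling an LSTM block over a sequence with Recurrence. This relies on
# the LSTM variant in this module that exposes create_placeholder; cntk.ops.input_variable
# and the helper name are assumptions.
def _example_recurrence():
    from cntk.ops import input_variable
    x = input_variable(50)                 # a sequence of 50-dimensional feature vectors
    rnn = Recurrence(LSTM(128), go_backwards=False, initial_state=0)
    return rnn(x)                          # h(t) for every step, 128-dimensional
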
def Placeholder(_inf, name='placeholder'):
    p = placeholder_variable(shape=_as_tuple(_inf.shape), dynamic_axes=_inf.axis, name=name)
    _name_node(p, name)
    if _trace_layers:
        print("new " + _node_description(p))
    return p

def Dense(shape, init=init_default_or_glorot_uniform, activation=activation_default_or_None,
          input_rank=None, map_rank=None,
          bias=bias_default_or_True, init_bias=init_bias_default_or_0):
    activation = _resolve_activation(activation)
    bias       = bias if _is_given(bias) else _current_default_options.bias
    output_shape = _as_tuple(shape)

    if input_rank is not None and map_rank is not None:
        raise ValueError("Dense: input_rank and map_rank cannot be specified at the same time.")

    # determine meaning of axes
    # W gets dimension (input_shape + shape)
    # where input_shape is determined as:
    #  - by default, equal to the dimensions of the input passed to Dense()
    #  - if input_rank is given, then the last 'input_rank' dimensions of the input (all others are not reduced over)
    #  - if map_rank is given, then all but the first 'map_rank' dimensions of the input (those are not reduced over)
    # where input_rank and map_rank are mutually exclusive.

    #output_rank = -len(output_shape)   # support outputs with tensor layouts
    # BUGBUG: Should this be a negative number now, since output is the last axis in Python?
    output_rank = len(output_shape)   # support outputs with tensor layouts

    # If input_rank not given then pass a single _INFERRED; map_rank if given will determine the input_rank.
    # The dimension inference may still create multiple axes.
    input_shape = _INFERRED * (input_rank if input_rank is not None else 1)

    if input_rank is not None:
        UntestedBranchError("Dense, input_rank option not implemented")
        infer_input_rank_to_map = -1  # means map_rank is not specified; input_rank rules
    elif map_rank is None:
        infer_input_rank_to_map = 0   # neither given: default to 'infer W to use all input dims'
    else:
        UntestedBranchError("Dense, map_rank option not implemented")
        infer_input_rank_to_map = map_rank  # infer W to use all input dims except the first static 'map_rank' ones

    # parameters bound to this Function
    init_weights = _initializer_for(init, Record(output_rank=output_rank))
    W = Parameter(input_shape + output_shape, init=init_weights, name='W')
    b = Parameter(              output_shape, init=init_bias,    name='b') if bias else None

    # expression of this function
    x = Placeholder(name='dense_arg')
    apply_x = times(x, W, output_rank=output_rank, infer_input_rank_to_map=infer_input_rank_to_map)
    if b:
        apply_x = apply_x + b
    apply_x = apply_x >> activation
    return Block(apply_x, 'Dense', Record(W=W, b=b))

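# Example (sketch): a two-layer classifier composed from Dense. Assumes cntk.ops.input_variable
# and cntk.ops.relu; the helper name is illustrative only.
def _example_mlp():
    from cntk.ops import input_variable, relu
    x = input_variable(784)
    hidden = Dense(256, activation=relu)
    out = Dense(10)            # no activation: raw scores for a downstream softmax criterion
    return out(hidden(x))
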
def Embedding(shape, _inf, weights=None, init=_default_initializer, transpose=False):
    shape = _as_tuple(shape)
    full_shape = (shape + _inf.shape) if transpose else (_inf.shape + shape)
    if weights is None:  # no weights given: learn the embedding
        E = Parameter(full_shape, init=init, name='E')
    else:                # weights given: use them as constant
        UntestedBranchError("Embedding, from constant")
        E = Constant(full_shape, init=weights, name='E')  # TODO: can 'weights' be a CNTK object already? Then how to do this?
    x = Placeholder(_inf=_inf, name='embedding_arg')
    apply_x = Function.__matmul__(E, x) if transpose else \
              Function.__matmul__(x, E)  # x is expected to be sparse one-hot
    _name_and_extend_Function(apply_x, 'Embedding')
    return apply_x

def _RecurrentBlock(type, shape, cell_shape, activation, use_peepholes,
                    init, init_bias,
                    enable_self_stabilization,
                    name=''):
    '''
    Helper to create a recurrent block of type 'LSTM', 'GRU', or 'RNNUnit'.
    '''

    has_projection = cell_shape is not None

    shape = _as_tuple(shape)

    cell_shape = _as_tuple(cell_shape) if cell_shape is not None else shape
    if len(shape) != 1 or len(cell_shape) != 1:
        raise ValueError("%s: shape and cell_shape must be vectors (rank-1 tensors)" % type)
        # otherwise we'd need to fix slicing and Param initializers

    stack_axis = -1  # for efficient computation, we stack multiple variables (along the fastest-changing one, to match BS)
    # determine stacking dimensions
    cell_shape_list = list(cell_shape)
    stacked_dim = cell_shape_list[stack_axis]
    cell_shape_list[stack_axis] = stacked_dim * {
        'RNNUnit': 1,
        'GRU':     3,
        'LSTM':    4
    }[type]
    cell_shape_stacked = tuple(cell_shape_list)    # patched dims with stack_axis duplicated 4 times
    cell_shape_list[stack_axis] = stacked_dim * {
        'RNNUnit': 1,
        'GRU':     2,
        'LSTM':    4
    }[type]
    cell_shape_stacked_H = tuple(cell_shape_list)  # patched dims with stack_axis duplicated 4 times

    # parameters
    b  = Parameter(            cell_shape_stacked,   init=init_bias, name='b')                                # bias
    W  = Parameter(_INFERRED + cell_shape_stacked,   init=init,      name='W')                                # input
    H  = Parameter(shape     + cell_shape_stacked_H, init=init,      name='H')                                # hidden-to-hidden
    H1 = Parameter(shape     + cell_shape,           init=init,      name='H1') if type == 'GRU' else None    # hidden-to-hidden
    Ci = Parameter(            cell_shape,           init=init,      name='Ci') if use_peepholes else None    # cell-to-hidden {note: applied elementwise}
    Cf = Parameter(            cell_shape,           init=init,      name='Cf') if use_peepholes else None    # cell-to-hidden {note: applied elementwise}
    Co = Parameter(            cell_shape,           init=init,      name='Co') if use_peepholes else None    # cell-to-hidden {note: applied elementwise}

    Wmr = Parameter(cell_shape + shape, init=init, name='P') if has_projection else None  # final projection

    # each use of a stabilizer layer must get its own instance
    Sdh = Stabilizer(enable_self_stabilization=enable_self_stabilization, name='dh_stabilizer')
    Sdc = Stabilizer(enable_self_stabilization=enable_self_stabilization, name='dc_stabilizer')
    Sct = Stabilizer(enable_self_stabilization=enable_self_stabilization, name='c_stabilizer')
    Sht = Stabilizer(enable_self_stabilization=enable_self_stabilization, name='P_stabilizer')

    # define the model function itself
    # general interface for Recurrence():
    #   (all previous outputs delayed, input) --> (outputs and state)
    # where
    #  - the first output is the main output, e.g. 'h' for LSTM
    #  - the remaining outputs, if any, are additional state
    #  - if for some reason output != state, then output is still fed back and should just be ignored by the recurrent block

    # LSTM model function
    # in this case:
    #   (dh, dc, x) --> (h, c)
    def lstm(dh, dc, x):

        dhs = Sdh(dh)  # previous values, stabilized
        dcs = Sdc(dc)
        # note: input does not get a stabilizer here, user is meant to do that outside

        # projected contribution from input(s), hidden, and bias
        proj4 = b + times(x, W) + times(dhs, H)

        it_proj  = slice(proj4, stack_axis, 0*stacked_dim, 1*stacked_dim)  # split along stack_axis
        bit_proj = slice(proj4, stack_axis, 1*stacked_dim, 2*stacked_dim)
        ft_proj  = slice(proj4, stack_axis, 2*stacked_dim, 3*stacked_dim)
        ot_proj  = slice(proj4, stack_axis, 3*stacked_dim, 4*stacked_dim)

        # helper to inject peephole connection if requested
        def peep(x, c, C):
            return x + C * c if use_peepholes else x

        it = sigmoid(peep(it_proj, dcs, Ci))      # input gate(t)
        # TODO: should both activations be replaced?
        bit = it * activation(bit_proj)           # applied to tanh of input network

        ft = sigmoid(peep(ft_proj, dcs, Cf))      # forget-me-not gate(t)
        bft = ft * dc                             # applied to cell(t-1)

        ct = bft + bit                            # c(t) is sum of both

        ot = sigmoid(peep(ot_proj, Sct(ct), Co))  # output gate(t)
        ht = ot * activation(ct)                  # applied to tanh(cell(t))

        c = ct                                    # cell value
        h = times(Sht(ht), Wmr) if has_projection else \
            ht

        # returns the new state as a tuple with names but order matters
        return (Function.NamedOutput(h=h), Function.NamedOutput(c=c))

    # GRU model function
    # in this case:
    #   (dh, x) --> (h)
    # e.g. https://en.wikipedia.org/wiki/Gated_recurrent_unit
    def gru(dh, x):

        dhs = Sdh(dh)  # previous value, stabilized
        # note: input does not get a stabilizer here, user is meant to do that outside

        # projected contribution from input(s), hidden, and bias
        projx3 = b + times(x, W)
        projh2 = times(dhs, H)

        zt_proj = slice(projx3, stack_axis, 0*stacked_dim, 1*stacked_dim) + slice(projh2, stack_axis, 0*stacked_dim, 1*stacked_dim)
        rt_proj = slice(projx3, stack_axis, 1*stacked_dim, 2*stacked_dim) + slice(projh2, stack_axis, 1*stacked_dim, 2*stacked_dim)
        ct_proj = slice(projx3, stack_axis, 2*stacked_dim, 3*stacked_dim)

        zt = sigmoid(zt_proj)          # update gate z(t)
        rt = sigmoid(rt_proj)          # reset gate r(t)

        rs = dhs * rt                  # "cell" c
        ct = activation(ct_proj + times(rs, H1))

        ht = (1 - zt) * ct + zt * dhs  # hidden state ht / output

        # for comparison: CUDNN_GRU
        # i(t) = sigmoid(W_i x(t) + R_i h(t-1) + b_Wi + b_Ru)
        # r(t) = sigmoid(W_r x(t) + R_r h(t-1) + b_Wr + b_Rr)               --same up to here
        # h'(t) =   tanh(W_h x(t) + r(t) .* (R_h h(t-1)) + b_Wh + b_Rh)     --r applied after projection? Would make life easier!
        # h(t) = (1 - i(t) .* h'(t)) + i(t) .* h(t-1)                       --TODO: need to confirm bracketing with NVIDIA

        h = times(Sht(ht), Wmr) if has_projection else \
            ht

        # returns the new state as a tuple with names but order matters
        return Function.NamedOutput(h=h)

    def rnn(dh, x):
        dhs = Sdh(dh)  # previous value, stabilized
        ht = activation(times(x, W) + times(dhs, H) + b)
        h = times(Sht(ht), Wmr) if has_projection else \
            ht
        return Function.NamedOutput(h=h)

    function = {
        'RNNUnit': rnn,
        'GRU':     gru,
        'LSTM':    lstm
    }[type]

    # return the corresponding lambda as a CNTK Function
    return BlockFunction(type, name)(function)

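# Sketch of the gate stacking used above: for an LSTM with cell dimension 128, the four gate
# pre-activations (input, candidate, forget, output) come out of one fused projection of width
# 4*128 and are then split with slice() along stack_axis. This helper (illustrative only,
# plain Python, no CNTK calls) just computes the slice offsets.
def _example_gate_offsets(stacked_dim=128):
    gates = ['it_proj', 'bit_proj', 'ft_proj', 'ot_proj']
    return {name: (k * stacked_dim, (k + 1) * stacked_dim) for k, name in enumerate(gates)}
    # e.g. {'it_proj': (0, 128), 'bit_proj': (128, 256), 'ft_proj': (256, 384), 'ot_proj': (384, 512)}
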
def LSTM(shape, cell_shape=None, use_peepholes=use_peepholes_default_or_False,
         init=init_default_or_glorot_uniform, init_bias=init_bias_default_or_0,
         enable_self_stabilization=enable_self_stabilization_default_or_False):  # (x, (h, c))

    use_peepholes             = use_peepholes             if _is_given(use_peepholes)             else _current_default_options.use_peepholes
    enable_self_stabilization = enable_self_stabilization if _is_given(enable_self_stabilization) else _current_default_options.enable_self_stabilization

    has_projection = cell_shape is not None
    has_aux = False

    if has_aux:
        UntestedBranchError("LSTM, has_aux option")

    shape = _as_tuple(shape)

    cell_shape = _as_tuple(cell_shape) if cell_shape is not None else shape
    if len(shape) != 1 or len(cell_shape) != 1:
        raise ValueError("LSTM: shape and cell_shape must be vectors (rank-1 tensors)")
        # otherwise we'd need to fix slicing and Param initializers

    stack_axis = -1  # stacking along the fastest-changing one, to match BS
    # determine stacking dimensions
    cell_shape_list = list(cell_shape)
    stacked_dim = cell_shape_list[0]
    cell_shape_list[stack_axis] = stacked_dim*4
    cell_shape_stacked = tuple(cell_shape_list)  # patched dims with stack_axis duplicated 4 times

    # parameters
    b  = Parameter(            cell_shape_stacked, init=init_bias, name='b')                              # a bias
    W  = Parameter(_INFERRED + cell_shape_stacked, init=init,      name='W')                              # input
    A  = Parameter(_INFERRED + cell_shape_stacked, init=init,      name='A') if has_aux else None         # aux input (optional)
    H  = Parameter(shape     + cell_shape_stacked, init=init,      name='H')                              # hidden-to-hidden
    Ci = Parameter(            cell_shape,         init=init,      name='Ci') if use_peepholes else None  # cell-to-hidden {note: applied elementwise}
    Cf = Parameter(            cell_shape,         init=init,      name='Cf') if use_peepholes else None  # cell-to-hidden {note: applied elementwise}
    Co = Parameter(            cell_shape,         init=init,      name='Co') if use_peepholes else None  # cell-to-hidden {note: applied elementwise}

    Wmr = Parameter(cell_shape + shape, init=init) if has_projection else None  # final projection

    Sdh = Stabilizer() if enable_self_stabilization else identity
    Sdc = Stabilizer() if enable_self_stabilization else identity
    Sct = Stabilizer() if enable_self_stabilization else identity
    Sht = Stabilizer() if enable_self_stabilization else identity

    def create_hc_placeholder():
        # we pass the known dimensions here, which makes dimension inference easier
        return (Placeholder(shape=shape, name='hPh'), Placeholder(shape=cell_shape, name='cPh'))  # (h, c)

    # parameters to model function
    x = Placeholder(name='lstm_block_arg')
    prev_state = create_hc_placeholder()

    # formula of model function
    dh, dc = prev_state

    dhs = Sdh(dh)  # previous values, stabilized
    dcs = Sdc(dc)
    # note: input does not get a stabilizer here, user is meant to do that outside

    # projected contribution from input(s), hidden, and bias
    proj4 = b + times(x, W) + times(dhs, H) + times(aux, A) if has_aux else \
            b + times(x, W) + times(dhs, H)

    it_proj  = slice(proj4, stack_axis, 0*stacked_dim, 1*stacked_dim)  # split along stack_axis
    bit_proj = slice(proj4, stack_axis, 1*stacked_dim, 2*stacked_dim)
    ft_proj  = slice(proj4, stack_axis, 2*stacked_dim, 3*stacked_dim)
    ot_proj  = slice(proj4, stack_axis, 3*stacked_dim, 4*stacked_dim)

    # add peephole connection if requested
    def peep(x, c, C):
        return x + C * c if use_peepholes else x

    it = sigmoid(peep(it_proj, dcs, Ci))      # input gate(t)
    bit = it * tanh(bit_proj)                 # applied to tanh of input network
    ft = sigmoid(peep(ft_proj, dcs, Cf))      # forget-me-not gate(t)
    bft = ft * dc                             # applied to cell(t-1)

    ct = bft + bit                            # c(t) is sum of both

    ot = sigmoid(peep(ot_proj, Sct(ct), Co))  # output gate(t)
    ht = ot * tanh(ct)                        # applied to tanh(cell(t))

    c = ct                                    # cell value
    h = times(Sht(ht), Wmr) if has_projection else \
        ht

    _name_node(h, 'h')
    if _trace_layers:
        _log_node(h)  # this looks right
    _name_node(c, 'c')

    # TODO: figure out how to do scoping, and also rename all the apply... to expression
    apply_x_h_c = combine([h, c])
    # return to caller a helper function to create placeholders for recurrence
    # Note that this function will only exist in the object returned here, but not any cloned version of it.
    apply_x_h_c.create_placeholder = create_hc_placeholder
    #return Block(apply_x_h_c, 'LSTM')  # BUGBUG: fails with "RuntimeError: A Function instance with more than one output cannot be implicitly converted to a Variable"
    return apply_x_h_c

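# Example (sketch): an LSTM with a projected output, i.e. a 1024-dimensional cell reduced to a
# 256-dimensional hidden state through Wmr, wrapped in Recurrence. Assumes cntk.ops.input_variable;
# the helper name is illustrative only.
def _example_projected_lstm():
    from cntk.ops import input_variable
    x = input_variable(80)
    lstm = LSTM(256, cell_shape=1024)  # has_projection=True: h = times(Sht(ht), Wmr)
    return Recurrence(lstm)(x)
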
def _Infer(shape, axis):
    from cntk.utils import Record, _as_tuple
    return Record(shape=_as_tuple(shape), axis=axis, with_shape=lambda new_shape: _Infer(new_shape, axis))

def LSTM(shape, _inf, cell_shape=None, use_peepholes=False, init=_default_initializer, init_bias=0,
         enable_self_stabilization=False):  # (x, (h, c))
    has_projection = cell_shape is not None
    has_aux = False

    if has_aux:
        UntestedBranchError("LSTM, has_aux option")
    if enable_self_stabilization:
        UntestedBranchError("LSTM, enable_self_stabilization option")

    shape = _as_tuple(shape)
    cell_shape = _as_tuple(cell_shape) if cell_shape is not None else shape

    #stack_axis = -1
    stack_axis = 0  # BUGBUG: should be -1, i.e. the fastest-changing one, to match BS
    # determine stacking dimensions
    cell_shape_list = list(cell_shape)
    stacked_dim = cell_shape_list[0]
    cell_shape_list[stack_axis] = stacked_dim * 4
    cell_shape_stacked = tuple(cell_shape_list)  # patched dims with stack_axis duplicated 4 times

    # parameters
    b  = Parameter(             cell_shape_stacked, init=init_bias, name='b')                              # a bias
    W  = Parameter(_inf.shape + cell_shape_stacked, init=init,      name='W')                              # input
    A  = Parameter(_inf.shape + cell_shape_stacked, init=init,      name='A') if has_aux else None         # aux input (optional)
    H  = Parameter(shape      + cell_shape_stacked, init=init,      name='H')                              # hidden-to-hidden
    Ci = Parameter(             cell_shape,         init=init,      name='Ci') if use_peepholes else None  # cell-to-hidden {note: applied elementwise}
    Cf = Parameter(             cell_shape,         init=init,      name='Cf') if use_peepholes else None  # cell-to-hidden {note: applied elementwise}
    Co = Parameter(             cell_shape,         init=init,      name='Co') if use_peepholes else None  # cell-to-hidden {note: applied elementwise}

    Wmr = ParameterTensor(cell_shape + shape, init=init, init_value_scale=init_value_scale) if has_projection else None  # final projection

    Sdh = Stabilizer(_inf=_inf.with_shape(     shape)) if enable_self_stabilization else Identity(_inf=_inf.with_shape(     shape))
    Sdc = Stabilizer(_inf=_inf.with_shape(cell_shape)) if enable_self_stabilization else Identity(_inf=_inf.with_shape(cell_shape))
    Sct = Stabilizer(_inf=_inf.with_shape(cell_shape)) if enable_self_stabilization else Identity(_inf=_inf.with_shape(cell_shape))
    Sht = Stabilizer(_inf=_inf.with_shape(     shape)) if enable_self_stabilization else Identity(_inf=_inf.with_shape(     shape))

    def create_hc_placeholder():
        return (Placeholder(_inf=_inf.with_shape(shape), name='hPh'), Placeholder(_inf=_inf.with_shape(cell_shape), name='cPh'))  # (h, c)

    # parameters to model function
    x = Placeholder(_inf=_inf, name='lstm_block_arg')
    prev_state = create_hc_placeholder()

    # formula of model function
    dh, dc = prev_state

    dhs = Sdh(dh)  # previous values, stabilized
    dcs = Sdc(dc)
    # note: input does not get a stabilizer here, user is meant to do that outside

    # projected contribution from input(s), hidden, and bias
    proj4 = b + times(x, W) + times(dhs, H) + times(aux, A) if has_aux else \
            b + times(x, W) + times(dhs, H)

    it_proj  = slice(proj4, stack_axis, 0 * stacked_dim, 1 * stacked_dim)  # split along stack_axis
    bit_proj = slice(proj4, stack_axis, 1 * stacked_dim, 2 * stacked_dim)
    ft_proj  = slice(proj4, stack_axis, 2 * stacked_dim, 3 * stacked_dim)
    ot_proj  = slice(proj4, stack_axis, 3 * stacked_dim, 4 * stacked_dim)

    # add peephole connection if requested
    def peep(x, c, C):
        return x + C * c if use_peepholes else x

    it = sigmoid(peep(it_proj, dcs, Ci))      # input gate(t)
    bit = it * tanh(bit_proj)                 # applied to tanh of input network
    ft = sigmoid(peep(ft_proj, dcs, Cf))      # forget-me-not gate(t)
    bft = ft * dc                             # applied to cell(t-1)

    ct = bft + bit                            # c(t) is sum of both

    ot = sigmoid(peep(ot_proj, Sct(ct), Co))  # output gate(t)
    ht = ot * tanh(ct)                        # applied to tanh(cell(t))

    c = ct                                    # cell value
    h = times(Sht(ht), Wmr) if has_projection else \
        ht

    _name_node(h, 'h')
    if _trace_layers:
        _log_node(h)  # this looks right
    _name_node(c, 'c')

    # TODO: figure out how to do scoping, and also rename all the apply... to expression
    apply_x_h_c = combine([h, c])
    # return to caller a helper function to create placeholders for recurrence
    apply_x_h_c.create_placeholder = create_hc_placeholder
    _name_and_extend_Function(apply_x_h_c, 'LSTM')
    return apply_x_h_c

def __call__(self, *args):
    return _apply(self, _as_tuple(args))

def _Infer(shape, axis):
    return Record(shape=_as_tuple(shape), axis=axis, with_shape=lambda new_shape: _Infer(new_shape, axis))