def _sanitize_minibatch_source(minibatch_source, model_inputs_to_streams, criterion, infinitely_repeat=True):
    '''
    Helper to wrap numpy/scipy data into a minibatch source.

    Args:
        minibatch_source: ``None``, an existing ``MinibatchSource`` or
         ``UserMinibatchSource`` (both returned unchanged), or raw data: a tuple
         of numpy or scipy arrays in the order of the arguments of ``criterion``.
        model_inputs_to_streams: mapping from model inputs to streams. Must be
         ``None`` when raw data is passed, because the mapping is then built here.
        criterion: function whose ``arguments`` supply name, type and order of
         the data streams. Required when raw data is passed.
        infinitely_repeat (bool): if ``True`` the constructed source uses
         ``INFINITELY_REPEAT``; otherwise it is limited to ``len(args[0])``
         samples, i.e. one data pass.

    Returns:
        tuple ``(minibatch_source, model_inputs_to_streams)``; both are newly
        constructed when raw data was passed, otherwise the inputs are returned as is.

    Raises:
        ValueError: if raw data is passed without a criterion, with a number of
         arrays different from the number of criterion arguments, or together
         with an explicit ``model_inputs_to_streams`` mapping.
    '''
    from ..io import MinibatchSource, UserMinibatchSource, MinibatchSourceFromData, INFINITELY_REPEAT

    # UserMinibatchSource derives from cntk_py.SwigMinibatchSource, not MinibatchSource, for director purposes
    is_source = isinstance(minibatch_source, (MinibatchSource, UserMinibatchSource))
    # Never take the truth value of the data itself: bool() of a numpy/scipy array with
    # more than one element raises ValueError. Only empty tuples/lists are treated as "no data".
    is_empty_container = isinstance(minibatch_source, (tuple, list)) and not minibatch_source
    if minibatch_source is None or is_source or is_empty_container:
        return minibatch_source, model_inputs_to_streams

    # the minibatch_source is a tuple of numpy or scipy arrays that we construct a source around
    args = _as_tuple(minibatch_source)
    if criterion is None:
        raise ValueError("when passing data directly in place of a minibatch source, criterion must be given")
    params = criterion.arguments
    if len(params) != len(args):
        raise ValueError("to pass data directly in place of a minibatch source, pass a tuple of {} numpy or scipy arrays, in the order of the arguments of the criterion function. You passed {} value(s)"
                         .format(len(params), len(args)))
    # reject a conflicting mapping before any data source is built
    if model_inputs_to_streams is not None:
        raise ValueError("mapping must not be provided when data is passed directly")

    # names are for debugging...
    param_names = [param.name if param.name else "stream_%s" % i for i, param in enumerate(params)]
    # ...and for stream names and thus must be unique. If multiple inputs have the same names...
    if len(params) != len(set(param_names)):
        # ...we fall back to generic names
        param_names = ["stream_%s" % i for i, _ in enumerate(params)]
    param_types = [param._type for param in params]

    # if not infinite then do one data pass
    max_samples = INFINITELY_REPEAT if infinitely_repeat else len(args[0])
    minibatch_source = MinibatchSourceFromData(
        {name: (data, data_type) for name, data, data_type in zip(param_names, args, param_types)},
        max_samples=max_samples)
    model_inputs_to_streams = {param: minibatch_source.streams[name]
                               for param, name in zip(params, param_names)}
    return minibatch_source, model_inputs_to_streams
def gru_cell(shape, init=glorot_uniform(), name=''):
    """
    Build a GRU cell (without bias terms) as a graph over two placeholders.

    The graph has a placeholder for the input (named 'gru_block_arg') and one
    for the previous state (named 'S'), and computes::

        z = sigmoid(x * Uz + S_prev * Wz)
        r = sigmoid(x * Ur + S_prev * Wr)
        h = tanh(x * Uh + (S_prev .* r) * Wh)
        s = (1 - z) .* h + z .* S_prev

    Args:
        shape: state dimension; must convert to a rank-1 tuple.
        init: initializer used for all six weight parameters.
        name: name given to the final ``plus`` node that produces the new state.

    Returns:
        ``combine([s])`` with an extra attribute ``create_placeholder`` that
        creates a fresh state placeholder of the right shape.

    Raises:
        ValueError: if ``shape`` is not rank 1.
    """
    shape = _as_tuple(shape)
    if len(shape) != 1:
        raise ValueError("gru_cell: shape must be vectors (rank-1 tensors)")

    # a rank-1 shape repeated twice is the (dim, dim) shape of the state-to-state weights
    recurrent_shape = shape * 2

    def new_weight(weight_shape, weight_name):
        return Parameter(weight_shape, init=init, name=weight_name)

    # state-to-state weights first, then input weights (same creation order as always)
    Wz, Wr, Wh = [new_weight(recurrent_shape, label) for label in ('Wz', 'Wr', 'Wh')]
    Uz, Ur, Uh = [new_weight(_INFERRED + shape, label) for label in ('Uz', 'Ur', 'Uh')]

    def create_s_placeholder():
        # we pass the known dimensions here, which makes dimension inference easier
        return Placeholder(shape=shape, name='S')

    # arguments of the model function
    x = Placeholder(name='gru_block_arg')
    prev_state = create_s_placeholder()

    # update gate
    z_from_input = times(x, Uz, name='x*Uz')
    z_from_state = times(prev_state, Wz, name='Sprev*Wz')
    update_gate = sigmoid(z_from_input + z_from_state, name='z')

    # reset gate
    r_from_input = times(x, Ur, name='x*Ur')
    r_from_state = times(prev_state, Wr, name='Sprev*Wr')
    reset_gate = sigmoid(r_from_input + r_from_state, name='r')

    # candidate state: the reset gate is applied to the previous state before its projection
    h_from_input = times(x, Uh, name='x*Uh')
    reset_state = element_times(prev_state, reset_gate, name='Sprev*r')
    candidate = tanh(h_from_input + times(reset_state, Wh), name='h')

    # new state interpolates between candidate and previous state
    fresh_part = element_times((1 - update_gate), candidate, name='(1-z)*h')
    kept_part = element_times(update_gate, prev_state, name='z*SPrev')
    new_state = plus(fresh_part, kept_part, name=name)

    apply_x_s = combine([new_state])
    apply_x_s.create_placeholder = create_s_placeholder
    return apply_x_s
def _sanitize_minibatch_source(minibatch_source, model_inputs_to_streams, criterion, infinitely_repeat=True):
    '''
    Helper to wrap numpy/scipy data into a minibatch source.

    Args:
        minibatch_source: ``None``, an existing ``MinibatchSource`` or
         ``UserMinibatchSource`` (both returned unchanged), or raw data: a tuple
         of numpy or scipy arrays in the order of the arguments of ``criterion``.
        model_inputs_to_streams: mapping from model inputs to streams. Must be
         ``None`` when raw data is passed, because the mapping is then built here.
        criterion: function whose ``arguments`` supply name, type and order of
         the data streams. Required when raw data is passed.
        infinitely_repeat (bool): if ``True`` the constructed source uses
         ``INFINITELY_REPEAT``; otherwise it is limited to ``len(args[0])``
         samples, i.e. one data pass.

    Returns:
        tuple ``(minibatch_source, model_inputs_to_streams)``; both are newly
        constructed when raw data was passed, otherwise the inputs are returned as is.

    Raises:
        ValueError: if raw data is passed without a criterion, with a number of
         arrays different from the number of criterion arguments, or together
         with an explicit ``model_inputs_to_streams`` mapping.
    '''
    from ..io import MinibatchSource, UserMinibatchSource, MinibatchSourceFromData, INFINITELY_REPEAT

    # UserMinibatchSource derives from cntk_py.SwigMinibatchSource, not MinibatchSource, for director purposes
    is_source = isinstance(minibatch_source, (MinibatchSource, UserMinibatchSource))
    # Never take the truth value of the data itself: bool() of a numpy/scipy array with
    # more than one element raises ValueError. Only empty tuples/lists are treated as "no data".
    is_empty_container = isinstance(minibatch_source, (tuple, list)) and not minibatch_source
    if minibatch_source is None or is_source or is_empty_container:
        return minibatch_source, model_inputs_to_streams

    # the minibatch_source is a tuple of numpy or scipy arrays that we construct a source around
    args = _as_tuple(minibatch_source)
    if criterion is None:
        raise ValueError("when passing data directly in place of a minibatch source, criterion must be given")
    params = criterion.arguments
    if len(params) != len(args):
        raise ValueError("to pass data directly in place of a minibatch source, pass a tuple of {} numpy or scipy arrays, in the order of the arguments of the criterion function. You passed {} value(s)"
                         .format(len(params), len(args)))
    # reject a conflicting mapping before any data source is built
    if model_inputs_to_streams is not None:
        raise ValueError("mapping must not be provided when data is passed directly")

    # names are for debugging...
    param_names = [param.name if param.name else "stream_%s" % i for i, param in enumerate(params)]
    # ...and for stream names and thus must be unique. If multiple inputs have the same names...
    if len(params) != len(set(param_names)):
        # ...we fall back to generic names
        param_names = ["stream_%s" % i for i, _ in enumerate(params)]
    param_types = [param._type for param in params]

    # if not infinite then do one data pass
    max_samples = INFINITELY_REPEAT if infinitely_repeat else len(args[0])
    minibatch_source = MinibatchSourceFromData(
        {name: (data, data_type) for name, data, data_type in zip(param_names, args, param_types)},
        max_samples=max_samples)
    model_inputs_to_streams = {param: minibatch_source.streams[name]
                               for param, name in zip(params, param_names)}
    return minibatch_source, model_inputs_to_streams
def _RecurrentBlock(type, shape, cell_shape, activation, use_peepholes,
                    init, init_bias, enable_self_stabilization, name=''):
    '''
    Helper to create a recurrent block of type 'LSTM', 'GRU', or 'RNNUnit'.

    Args:
        type (str): 'LSTM', 'GRU' or 'RNNUnit'; selects the model function and
         how many variables are stacked into the shared parameters.
        shape: output dimension; must convert to a rank-1 tuple.
        cell_shape: cell dimension (rank 1), or ``None`` to use ``shape``. When
         given, a final projection parameter 'P' maps the cell output to ``shape``.
        activation: activation function applied inside the model functions.
        use_peepholes: if true, the peephole parameters 'Ci', 'Cf', 'Co' are
         created; they are used by the LSTM model function.
        init: initializer for the weight parameters.
        init_bias: initializer for the bias parameter 'b'.
        enable_self_stabilization: forwarded to every ``Stabilizer``.
        name: name of the resulting block function.

    Returns:
        The selected model function wrapped by ``BlockFunction(type, name)``.

    Raises:
        ValueError: if ``shape`` or ``cell_shape`` is not rank 1.
        KeyError: if ``type`` is not one of the supported types.
    '''
    has_projection = cell_shape is not None

    shape = _as_tuple(shape)
    cell_shape = shape if cell_shape is None else _as_tuple(cell_shape)
    if not (len(shape) == 1 and len(cell_shape) == 1):
        # otherwise we'd need to fix slicing and Param initializers
        raise ValueError("%s: shape and cell_shape must be vectors (rank-1 tensors)" % type)

    # for efficient computation, we stack multiple variables (along the fastest-changing one, to match BS)
    stack_axis = -1
    stacked_dim = cell_shape[stack_axis]

    def stacked_shape(copies):
        # cell_shape with the stacking axis widened to hold `copies` variables side by side
        dims = list(cell_shape)
        dims[stack_axis] = stacked_dim * copies
        return tuple(dims)

    # variables stacked in the input-side parameters (b, W) ...
    cell_shape_stacked = stacked_shape({'RNNUnit': 1, 'GRU': 3, 'LSTM': 4}[type])
    # ... and in the hidden-to-hidden parameter H (GRU keeps its third matrix separately in H1)
    cell_shape_stacked_H = stacked_shape({'RNNUnit': 1, 'GRU': 2, 'LSTM': 4}[type])

    # parameters
    b = Parameter(cell_shape_stacked, init=init_bias, name='b')              # bias
    W = Parameter(_INFERRED + cell_shape_stacked, init=init, name='W')       # input
    H = Parameter(shape + cell_shape_stacked_H, init=init, name='H')         # hidden-to-hidden
    H1 = None
    if type == 'GRU':
        H1 = Parameter(shape + cell_shape, init=init, name='H1')             # hidden-to-hidden
    if use_peepholes:
        # cell-to-hidden {note: applied elementwise}
        Ci = Parameter(cell_shape, init=init, name='Ci')
        Cf = Parameter(cell_shape, init=init, name='Cf')
        Co = Parameter(cell_shape, init=init, name='Co')
    else:
        Ci = Cf = Co = None
    Wmr = None
    if has_projection:
        Wmr = Parameter(cell_shape + shape, init=init, name='P')             # final projection

    # each use of a stabilizer layer must get its own instance
    def new_stabilizer(stabilizer_name):
        return Stabilizer(enable_self_stabilization=enable_self_stabilization, name=stabilizer_name)

    Sdh = new_stabilizer('dh_stabilizer')
    Sdc = new_stabilizer('dc_stabilizer')
    Sct = new_stabilizer('c_stabilizer')
    Sht = new_stabilizer('P_stabilizer')

    def stack_slice(stacked, index):
        # the index-th chunk of width stacked_dim along stack_axis
        return slice(stacked, stack_axis, index * stacked_dim, (index + 1) * stacked_dim)

    def peep(gate_input, cell, peephole):
        # inject the peephole connection if requested
        if use_peepholes:
            return gate_input + peephole * cell
        return gate_input

    def to_output(ht):
        # apply the (stabilized) final projection when a separate cell_shape was given
        if has_projection:
            return times(Sht(ht), Wmr)
        return ht

    # define the model function itself
    # general interface for Recurrence():
    #   (all previous outputs delayed, input) --> (outputs and state)
    # where
    #  - the first output is the main output, e.g. 'h' for LSTM
    #  - the remaining outputs, if any, are additional state
    #  - if for some reason output != state, then output is still fed back and should just be ignored by the recurrent block

    # LSTM model function
    # in this case:
    #   (dh, dc, x) --> (h, c)
    def lstm(dh, dc, x):
        # previous values, stabilized
        # note: input does not get a stabilizer here, user is meant to do that outside
        dhs = Sdh(dh)
        dcs = Sdc(dc)

        # projected contribution from input(s), hidden, and bias, split along stack_axis
        proj4 = b + times(x, W) + times(dhs, H)
        it_proj, bit_proj, ft_proj, ot_proj = [stack_slice(proj4, k) for k in range(4)]

        input_gate = sigmoid(peep(it_proj, dcs, Ci))        # input gate(t)
        # TODO: should both activations be replaced?
        gated_input = input_gate * activation(bit_proj)     # applied to tanh of input network

        forget_gate = sigmoid(peep(ft_proj, dcs, Cf))       # forget-me-not gate(t)
        kept_cell = forget_gate * dc                        # applied to cell(t-1)

        ct = kept_cell + gated_input                        # c(t) is sum of both

        output_gate = sigmoid(peep(ot_proj, Sct(ct), Co))   # output gate(t)
        ht = output_gate * activation(ct)                   # applied to tanh(cell(t))

        # returns the new state as a plain tuple; order matters
        return (to_output(ht), ct)

    # GRU model function
    # in this case:
    #   (dh, x) --> (h)
    # e.g. https://en.wikipedia.org/wiki/Gated_recurrent_unit
    def gru(dh, x):
        # previous value, stabilized
        # note: input does not get a stabilizer here, user is meant to do that outside
        dhs = Sdh(dh)

        # projected contribution from input(s) and bias, and from hidden
        projx3 = b + times(x, W)
        projh2 = times(dhs, H)

        zt_proj = stack_slice(projx3, 0) + stack_slice(projh2, 0)
        rt_proj = stack_slice(projx3, 1) + stack_slice(projh2, 1)
        ct_proj = stack_slice(projx3, 2)

        zt = sigmoid(zt_proj)        # update gate z(t)
        rt = sigmoid(rt_proj)        # reset gate r(t)

        # "cell" c: the reset gate is applied before the hidden projection H1
        reset_hidden = dhs * rt
        ct = activation(ct_proj + times(reset_hidden, H1))

        ht = (1 - zt) * ct + zt * dhs   # hidden state ht / output

        # for comparison: CUDNN_GRU
        # i(t) = sigmoid(W_i x(t) + R_i h(t-1) + b_Wi + b_Ru)
        # r(t) = sigmoid(W_r x(t) + R_r h(t-1) + b_Wr + b_Rr)   --same up to here
        # h'(t) = tanh(W_h x(t) + r(t) .* (R_h h(t-1)) + b_Wh + b_Rh)   --r applied after projection? Would make life easier!
        # h(t) = (1 - i(t) .* h'(t)) + i(t) .* h(t-1)   --TODO: need to confirm bracketing with NVIDIA

        return to_output(ht)

    # plain recurrent unit: (dh, x) --> (h)
    def rnn(dh, x):
        dhs = Sdh(dh)  # previous value, stabilized
        ht = activation(times(x, W) + times(dhs, H) + b)
        return to_output(ht)

    model_functions = {'RNNUnit': rnn, 'GRU': gru, 'LSTM': lstm}
    function = model_functions[type]

    # return the corresponding lambda as a CNTK Function
    return BlockFunction(type, name)(function)
def IndRNNBlock(type, shape, cell_shape, activation, use_peepholes,
                init, init_bias, enable_self_stabilization, name=''):
    '''
    Helper to create a recurrent block of type 'RNNStep' whose hidden-to-hidden
    weight ``H`` is a vector multiplied elementwise with the previous output.

    Args:
        type (str): must be 'RNNStep', the only model function defined here.
        shape: output dimension; must convert to a rank-1 tuple.
        cell_shape: cell dimension (rank 1), or ``None`` to use ``shape``. When
         given, a final projection parameter 'P' maps the cell output to ``shape``.
        activation: activation function applied to the pre-activation sum.
        use_peepholes: accepted but not used by this block.
        init: initializer for the weight parameters.
        init_bias: initializer for the bias parameter 'b'.
        enable_self_stabilization: forwarded to every ``Stabilizer``.
        name: name of the resulting block function.

    Returns:
        The model function wrapped by ``BlockFunction(type, name)``.

    Raises:
        ValueError: if ``shape`` or ``cell_shape`` is not rank 1.
        KeyError: if ``type`` is not 'RNNStep'.
    '''
    has_projection = cell_shape is not None

    shape = _as_tuple(shape)
    cell_shape = shape if cell_shape is None else _as_tuple(cell_shape)
    if not (len(shape) == 1 and len(cell_shape) == 1):
        # otherwise we'd need to fix slicing and Param initializers
        raise ValueError("%s: shape and cell_shape must be vectors (rank-1 tensors)" % type)

    # stacking axis is the fastest-changing one (to match BS); this block stacks a single variable
    stack_axis = -1
    stacked_dim = cell_shape[stack_axis]

    def stacked_shape(copies):
        # cell_shape with the stacking axis widened to hold `copies` variables
        dims = list(cell_shape)
        dims[stack_axis] = stacked_dim * copies
        return tuple(dims)

    cell_shape_stacked = stacked_shape(1)
    cell_shape_stacked_H = stacked_shape(1)

    # parameters
    b = Parameter(cell_shape_stacked, init=init_bias, name='b')          # bias
    W = Parameter(_INFERRED + cell_shape_stacked, init=init, name='W')   # input
    H = Parameter(cell_shape_stacked_H, init=init, name='H')             # hidden-to-hidden, applied elementwise
    Wmr = None
    if has_projection:
        Wmr = Parameter(cell_shape + shape, init=init, name='P')         # final projection

    # each use of a stabilizer layer must get its own instance
    def new_stabilizer(stabilizer_name):
        return Stabilizer(enable_self_stabilization=enable_self_stabilization, name=stabilizer_name)

    Sdh = new_stabilizer('dh_stabilizer')
    Sdc = new_stabilizer('dc_stabilizer')   # created as before, but not used by rnn_step
    Sct = new_stabilizer('c_stabilizer')    # created as before, but not used by rnn_step
    Sht = new_stabilizer('P_stabilizer')

    # (dh, x) --> (h)
    def rnn_step(dh, x):
        dhs = Sdh(dh)  # previous value, stabilized
        ht = activation(times(x, W) + dhs * H + b)
        if has_projection:
            return times(Sht(ht), Wmr)
        return ht

    model_functions = {'RNNStep': rnn_step}
    function = model_functions[type]

    # return the corresponding lambda as a CNTK Function
    return BlockFunction(type, name)(function)
def _RecurrentBlock(type, shape, cell_shape, activation, use_peepholes,
                    init, init_bias, enable_self_stabilization, name=''):
    '''
    Helper to create a recurrent block of type 'LSTM', 'GRU', or 'RNNUnit'.

    The model functions return their results wrapped in ``Function.NamedOutput``
    (``h`` for all types, additionally ``c`` for LSTM).

    Args:
        type (str): 'LSTM', 'GRU' or 'RNNUnit'; selects the model function and
         how many variables are stacked into the shared parameters.
        shape: output dimension; must convert to a rank-1 tuple.
        cell_shape: cell dimension (rank 1), or ``None`` to use ``shape``. When
         given, a final projection parameter 'P' maps the cell output to ``shape``.
        activation: activation function applied inside the model functions.
        use_peepholes: if true, the peephole parameters 'Ci', 'Cf', 'Co' are
         created; they are used by the LSTM model function.
        init: initializer for the weight parameters.
        init_bias: initializer for the bias parameter 'b'.
        enable_self_stabilization: forwarded to every ``Stabilizer``.
        name: name of the resulting block function.

    Returns:
        The selected model function wrapped by ``BlockFunction(type, name)``.

    Raises:
        ValueError: if ``shape`` or ``cell_shape`` is not rank 1.
        KeyError: if ``type`` is not one of the supported types.
    '''
    has_projection = cell_shape is not None

    shape = _as_tuple(shape)
    cell_shape = shape if cell_shape is None else _as_tuple(cell_shape)
    if not (len(shape) == 1 and len(cell_shape) == 1):
        # otherwise we'd need to fix slicing and Param initializers
        raise ValueError("%s: shape and cell_shape must be vectors (rank-1 tensors)" % type)

    # for efficient computation, we stack multiple variables (along the fastest-changing one, to match BS)
    stack_axis = -1
    stacked_dim = cell_shape[stack_axis]

    def stacked_shape(copies):
        # cell_shape with the stacking axis widened to hold `copies` variables side by side
        dims = list(cell_shape)
        dims[stack_axis] = stacked_dim * copies
        return tuple(dims)

    # variables stacked in the input-side parameters (b, W) ...
    cell_shape_stacked = stacked_shape({'RNNUnit': 1, 'GRU': 3, 'LSTM': 4}[type])
    # ... and in the hidden-to-hidden parameter H (GRU keeps its third matrix separately in H1)
    cell_shape_stacked_H = stacked_shape({'RNNUnit': 1, 'GRU': 2, 'LSTM': 4}[type])

    # parameters
    b = Parameter(cell_shape_stacked, init=init_bias, name='b')              # bias
    W = Parameter(_INFERRED + cell_shape_stacked, init=init, name='W')       # input
    H = Parameter(shape + cell_shape_stacked_H, init=init, name='H')         # hidden-to-hidden
    H1 = None
    if type == 'GRU':
        H1 = Parameter(shape + cell_shape, init=init, name='H1')             # hidden-to-hidden
    if use_peepholes:
        # cell-to-hidden {note: applied elementwise}
        Ci = Parameter(cell_shape, init=init, name='Ci')
        Cf = Parameter(cell_shape, init=init, name='Cf')
        Co = Parameter(cell_shape, init=init, name='Co')
    else:
        Ci = Cf = Co = None
    Wmr = None
    if has_projection:
        Wmr = Parameter(cell_shape + shape, init=init, name='P')             # final projection

    # each use of a stabilizer layer must get its own instance
    def new_stabilizer(stabilizer_name):
        return Stabilizer(enable_self_stabilization=enable_self_stabilization, name=stabilizer_name)

    Sdh = new_stabilizer('dh_stabilizer')
    Sdc = new_stabilizer('dc_stabilizer')
    Sct = new_stabilizer('c_stabilizer')
    Sht = new_stabilizer('P_stabilizer')

    def stack_slice(stacked, index):
        # the index-th chunk of width stacked_dim along stack_axis
        return slice(stacked, stack_axis, index * stacked_dim, (index + 1) * stacked_dim)

    def peep(gate_input, cell, peephole):
        # inject the peephole connection if requested
        if use_peepholes:
            return gate_input + peephole * cell
        return gate_input

    def to_output(ht):
        # apply the (stabilized) final projection when a separate cell_shape was given
        if has_projection:
            return times(Sht(ht), Wmr)
        return ht

    # define the model function itself
    # general interface for Recurrence():
    #   (all previous outputs delayed, input) --> (outputs and state)
    # where
    #  - the first output is the main output, e.g. 'h' for LSTM
    #  - the remaining outputs, if any, are additional state
    #  - if for some reason output != state, then output is still fed back and should just be ignored by the recurrent block

    # LSTM model function
    # in this case:
    #   (dh, dc, x) --> (h, c)
    def lstm(dh, dc, x):
        # previous values, stabilized
        # note: input does not get a stabilizer here, user is meant to do that outside
        dhs = Sdh(dh)
        dcs = Sdc(dc)

        # projected contribution from input(s), hidden, and bias, split along stack_axis
        proj4 = b + times(x, W) + times(dhs, H)
        it_proj, bit_proj, ft_proj, ot_proj = [stack_slice(proj4, k) for k in range(4)]

        input_gate = sigmoid(peep(it_proj, dcs, Ci))        # input gate(t)
        # TODO: should both activations be replaced?
        gated_input = input_gate * activation(bit_proj)     # applied to tanh of input network

        forget_gate = sigmoid(peep(ft_proj, dcs, Cf))       # forget-me-not gate(t)
        kept_cell = forget_gate * dc                        # applied to cell(t-1)

        ct = kept_cell + gated_input                        # c(t) is sum of both

        output_gate = sigmoid(peep(ot_proj, Sct(ct), Co))   # output gate(t)
        ht = output_gate * activation(ct)                   # applied to tanh(cell(t))

        c = ct                                              # cell value
        h = to_output(ht)

        # returns the new state as a tuple with names but order matters
        return (Function.NamedOutput(h=h), Function.NamedOutput(c=c))

    # GRU model function
    # in this case:
    #   (dh, x) --> (h)
    # e.g. https://en.wikipedia.org/wiki/Gated_recurrent_unit
    def gru(dh, x):
        # previous value, stabilized
        # note: input does not get a stabilizer here, user is meant to do that outside
        dhs = Sdh(dh)

        # projected contribution from input(s) and bias, and from hidden
        projx3 = b + times(x, W)
        projh2 = times(dhs, H)

        zt_proj = stack_slice(projx3, 0) + stack_slice(projh2, 0)
        rt_proj = stack_slice(projx3, 1) + stack_slice(projh2, 1)
        ct_proj = stack_slice(projx3, 2)

        zt = sigmoid(zt_proj)        # update gate z(t)
        rt = sigmoid(rt_proj)        # reset gate r(t)

        # "cell" c: the reset gate is applied before the hidden projection H1
        reset_hidden = dhs * rt
        ct = activation(ct_proj + times(reset_hidden, H1))

        ht = (1 - zt) * ct + zt * dhs   # hidden state ht / output

        # for comparison: CUDNN_GRU
        # i(t) = sigmoid(W_i x(t) + R_i h(t-1) + b_Wi + b_Ru)
        # r(t) = sigmoid(W_r x(t) + R_r h(t-1) + b_Wr + b_Rr)   --same up to here
        # h'(t) = tanh(W_h x(t) + r(t) .* (R_h h(t-1)) + b_Wh + b_Rh)   --r applied after projection? Would make life easier!
        # h(t) = (1 - i(t) .* h'(t)) + i(t) .* h(t-1)   --TODO: need to confirm bracketing with NVIDIA

        h = to_output(ht)

        # returns the new state as a named output
        return Function.NamedOutput(h=h)

    # plain recurrent unit: (dh, x) --> (h)
    def rnn(dh, x):
        dhs = Sdh(dh)  # previous value, stabilized
        ht = activation(times(x, W) + times(dhs, H) + b)
        h = to_output(ht)
        return Function.NamedOutput(h=h)

    model_functions = {'RNNUnit': rnn, 'GRU': gru, 'LSTM': lstm}
    function = model_functions[type]

    # return the corresponding lambda as a CNTK Function
    return BlockFunction(type, name)(function)
def _RecurrentBlock(type, shape, cell_shape, activation, use_peepholes,
                    init, init_bias, enable_self_stabilization, name=''):
    '''
    Helper to create an LSTM block that carries an extra slot-value vector
    ``sv`` through the recurrence: ``(dh, dc, sv, x) --> (h, c, sv)``.

    A sigmoid "reading gate" computed from ``x``, ``dh`` and ``sv`` scales
    ``sv`` elementwise; the scaled vector is both returned as the new ``sv``
    and added (through ``Wfc`` and tanh) to the memory cell update.

    Args:
        type (str): must be 'LSTM', the only model function defined here.
        shape: hidden dimension; must convert to a rank-1 tuple.
        cell_shape: dimension of the slot-value vector (rank 1), or ``None``
         to use ``shape``.
        activation: accepted but not used by this block (tanh is hard-coded).
        use_peepholes: accepted but not used by this block.
        init: initializer for the weight parameters.
        init_bias: initializer for the bias parameters 'b' and 'brg'.
        enable_self_stabilization: accepted but not used by this block.
        name: name of the resulting block function.

    Returns:
        The model function wrapped by ``BlockFunction(type, name)``.

    Raises:
        ValueError: if ``shape`` or ``cell_shape`` is not rank 1.
        KeyError: if ``type`` is not 'LSTM'.
    '''
    shape = _as_tuple(shape)
    cell_shape = _as_tuple(cell_shape) if cell_shape is not None else shape
    if len(shape) != 1 or len(cell_shape) != 1:
        raise ValueError("%s: shape and cell_shape must be vectors (rank-1 tensors)" % type)

    stack_axis = -1

    # slot value pair onehot vector dimension (taken from cell_shape)
    sv_dim = cell_shape[stack_axis]
    sv_shape_stacked = (sv_dim,)
    # hidden dimension (taken from shape); the three gates i, f, o are stacked along it
    stacked_dim = shape[stack_axis]

    # 3*hidden_dim: the stacked gate pre-activations. Both shapes were validated
    # to be rank 1 above, so the stacked shape has exactly this one entry.
    cell_shape_stacked = (stacked_dim * {'LSTM': 3}[type],)

    # parameters
    b = Parameter(cell_shape_stacked, init=init_bias, name='b')                         # gate bias
    brg = Parameter(sv_shape_stacked, init=init_bias, name='brg')                       # reading-gate bias
    W = Parameter(_INFERRED + cell_shape_stacked, init=init, name='W')                  # input -> gates
    Wrg = Parameter(_INFERRED + sv_shape_stacked, init=init, name='Wrg')                # input -> reading gate
    Wcx = Parameter(_INFERRED + shape, init=init, name='Wcx')                           # input -> cell candidate
    H = Parameter(shape + cell_shape_stacked, init=init, name='H')                      # hidden -> gates
    Hrg = Parameter(shape + sv_shape_stacked, init=init, name='Hrg')                    # hidden -> reading gate
    Hcx = Parameter(shape + shape, init=init, name='Hcx')                               # hidden -> cell candidate
    Hsv = Parameter(sv_shape_stacked + cell_shape_stacked, init=init, name='Hsv')       # sv -> gates
    Hsvrg = Parameter(sv_shape_stacked + sv_shape_stacked, init=init, name='Hsvrg')     # sv -> reading gate
    Wfc = Parameter(sv_shape_stacked + shape, init=init, name='Wfc')                    # gated sv -> cell

    # LSTM model function
    # in this case:
    #   (dh, dc, sv, x) --> (h, c, sv)
    def lstm(dh, dc, sv, x):
        # projected contribution from input(s), hidden, slot values, and bias
        proj3 = b + times(x, W) + times(dh, H) + times(sv, Hsv)

        it_proj = slice(proj3, stack_axis, 0 * stacked_dim, 1 * stacked_dim)
        ft_proj = slice(proj3, stack_axis, 1 * stacked_dim, 2 * stacked_dim)
        ot_proj = slice(proj3, stack_axis, 2 * stacked_dim, 3 * stacked_dim)

        it = sigmoid(it_proj)  # input gate(t)
        ft = sigmoid(ft_proj)  # forget-me-not gate(t)
        ot = sigmoid(ot_proj)  # output gate(t)

        # the following is the reading gate, applied elementwise to the slot values
        proj3rg = sigmoid(times(x, Wrg) + times(dh, Hrg) + times(sv, Hsvrg) + brg)
        v = proj3rg * sv

        # cell candidate; need to do stabilization ??
        cx_t = tanh(times(x, Wcx) + times(dh, Hcx))

        # update memory cell
        c = it * cx_t + ft * dc + tanh(times(v, Wfc))
        h = ot * tanh(c)

        return (h, c, v)

    function = {'LSTM': lstm}[type]

    # return the corresponding lambda as a CNTK Function
    return BlockFunction(type, name)(function)
def dense_factored(shapes, #(shape1, shape2)
                   activation=default_override_or(identity),
                   init={'W1':None, 'W2':None},
                   input_rank=None,
                   map_rank=None,
                   bias=default_override_or(True),
                   init_bias=default_override_or(0),
                   name=''):
    '''
    Create a dense layer whose weight matrix is factored into two matrices,
    W1 and W2, applied one after the other. The returned function represents
    the new model: ``activation(times(times(x, W1), W2) + b)``.

    Args:
        shapes : pair of ints ``(shape1, shape2)``. W1 has shape
         ``(inferred input dim, shape1)`` and W2 has shape ``(shape1, shape2)``.
        activation : activation function applied to the result, or None.
        init : dict with keys 'W1' and 'W2' holding the initial values of the
         two factor matrices. The default dict is only read, never modified.
        input_rank : not supported; must be None.
        map_rank : not supported; must be None.
        bias : if true, a bias parameter of shape ``shape2`` is added.
        init_bias : initial bias value.
        name : name of the block function that creates the new model.

    Returns:
        a model that is factored and projected (reduced).

    Raises:
        ValueError: if ``input_rank`` or ``map_rank`` is given, or if
         ``shapes`` does not consist of at least two ints.
    '''
    # matthaip: Not sure how to handle input tensor of rank > 1
    # or selective flattening of ranks
    # Explicit checks instead of assert, so they are still enforced under `python -O`.
    if input_rank is not None or map_rank is not None:
        raise ValueError("dense_factored: input_rank and map_rank are not supported and must be None.")
    shapes = tuple(shapes)
    if len(shapes) < 2 or not all(isinstance(s, int) for s in shapes):
        raise ValueError("dense_factored: shapes must be a pair of ints (shape1, shape2).")

    activation = get_default_override(cntk.layers.Dense, activation=activation)
    bias = get_default_override(cntk.layers.Dense, bias=bias)
    init_bias = get_default_override(cntk.layers.Dense, init_bias=init_bias)
    # how to use get_default_override for init parameter?

    output_shape1 = _as_tuple(shapes[0])
    output_shape2 = _as_tuple(shapes[1])

    # input_rank is never given here, so a single _INFERRED is passed.
    # The dimension inference may still create multiple axes.
    input_shape = _INFERRED

    # parameters bound to this Function
    # init_weights = _initializer_for(init, Record(output_rank=output_rank))
    init_weights = init
    W1 = Parameter(input_shape + output_shape1, init=init_weights['W1'], name='W1')
    W2 = Parameter(output_shape1 + output_shape2, init=init_weights['W2'], name='W2')
    b = Parameter(output_shape2, init=init_bias, name='b') if bias else None

    # expression of this function
    @BlockFunction('DenseFactored', name)
    def dense(x):
        r = times(x, W1)
        r = times(r, W2)
        if b:
            r = r + b
        if activation is not None:
            r = activation(r)
        return r

    return dense
def create_model(params: model_params):
    """
    Create ReasoNet model

    Builds the query and context inputs, a shared embedding, one GRU stack per
    sequence via ``optimized_rnnstack``, and hands the resulting memories to
    ``attention_model``.

    Args:
      params (class:`model_params`): The parameters used to create the model.
        Read here: ``vocab_dim``, ``embedding_dim``, ``hidden_dim``,
        ``attention_dim``, ``max_rl_steps``, ``dropout_rate``, ``init``
        and ``embedding_init``.

    Returns:
      Whatever ``attention_model`` returns for the constructed memories.
    """
    logger.log(
        "Create model: dropout_rate: {0}, init:{1}, embedding_init: {2}".format(
            params.dropout_rate, params.init, params.embedding_init))

    hidden_dim = params.hidden_dim
    embedding_shape = (params.vocab_dim, params.embedding_dim)

    # Query and Doc/Context/Paragraph inputs to the model
    query_seq_axis = Axis('sourceAxis')
    context_seq_axis = Axis('contextAxis')
    query_sequence = sequence.input(shape=params.vocab_dim, is_sparse=True,
                                    sequence_axis=query_seq_axis, name='query')
    context_sequence = sequence.input(shape=params.vocab_dim, is_sparse=True,
                                      sequence_axis=context_seq_axis, name='context')
    entity_ids_mask = sequence.input(shape=(1,), is_sparse=False,
                                     sequence_axis=context_seq_axis, name='entity_ids_mask')

    # embedding: use the supplied initial matrix, or a random one when none is given
    embedding_init = params.embedding_init
    if embedding_init is None:
        embedding_init = create_random_matrix(params.vocab_dim, params.embedding_dim)
    embedding = parameter(shape=embedding_shape, init=None)
    embedding.value = embedding_init
    # fixed copy of the initial embedding, used for entities below
    embedding_matrix = constant(embedding_init, shape=embedding_shape)

    use_dropout = params.dropout_rate is not None

    def embed(sparse_input, node_name):
        # look the input up in the trainable embedding, optionally followed by dropout
        if use_dropout:
            return ops.dropout(times(sparse_input, embedding), params.dropout_rate, name=node_name)
        return times(sparse_input, embedding, name=node_name)

    query_embedding = embed(query_sequence, 'query_embedding')
    context_embedding = embed(context_sequence, 'context_embedding')

    contextGruW = Parameter(_INFERRED + _as_tuple(hidden_dim), init=glorot_uniform(), name='gru_params')
    queryGruW = Parameter(_INFERRED + _as_tuple(hidden_dim), init=glorot_uniform(), name='gru_params')

    # Unlike other words in the context, we keep the entity vectors fixed as a random vector so that
    # each vector just means an identifier of different entities in the context and it has no semantic meaning
    entity_embedding = ops.times(context_sequence, embedding_matrix, name='constant_entity_embedding')
    full_context_embedding = ops.element_select(entity_ids_mask, entity_embedding, context_embedding)

    context_memory = ops.optimized_rnnstack(full_context_embedding, contextGruW, hidden_dim, 1, True,
                                            recurrent_op='gru', name='context_mem')
    query_memory = ops.optimized_rnnstack(query_embedding, queryGruW, hidden_dim, 1, True,
                                          recurrent_op='gru', name='query_mem')

    # get last fwd status and first bwd status
    last_query_step = sequence.last(query_memory)
    qfwd = ops.slice(last_query_step, -1, 0, hidden_dim, name='fwd')
    first_query_step = sequence.first(query_memory)
    qbwd = ops.slice(first_query_step, -1, hidden_dim, hidden_dim * 2, name='bwd')
    init_status = ops.splice(qfwd, qbwd, name='Init_Status')

    return attention_model(context_memory, query_memory, init_status,
                           hidden_dim, params.attention_dim,
                           max_steps=params.max_rl_steps)
def dense_factored(
        shapes,  #(shape1, shape2)
        activation=default_override_or(identity),
        init={
            'W1': None,
            'W2': None
        },
        input_rank=None,
        map_rank=None,
        bias=default_override_or(True),
        init_bias=default_override_or(0),
        name=''):
    '''
    Create a dense layer whose weight matrix is factored into two matrices,
    W1 and W2, applied one after the other. The returned function represents
    the new model: ``activation(times(times(x, W1), W2) + b)``.

    Args:
        shapes : pair of ints ``(shape1, shape2)``. W1 has shape
         ``(inferred input dim, shape1)`` and W2 has shape ``(shape1, shape2)``.
        activation : activation function applied to the result, or None.
        init : dict with keys 'W1' and 'W2' holding the initial values of the
         two factor matrices. The default dict is only read, never modified.
        input_rank : not supported; must be None.
        map_rank : not supported; must be None.
        bias : if true, a bias parameter of shape ``shape2`` is added.
        init_bias : initial bias value.
        name : name of the block function that creates the new model.

    Returns:
        a model that is factored and projected (reduced).

    Raises:
        ValueError: if ``input_rank`` or ``map_rank`` is given, or if
         ``shapes`` does not consist of at least two ints.
    '''
    # matthaip: Not sure how to handle input tensor of rank > 1
    # or selective flattening of ranks
    # Explicit checks instead of assert, so they are still enforced under `python -O`.
    if input_rank is not None or map_rank is not None:
        raise ValueError(
            "dense_factored: input_rank and map_rank are not supported and must be None."
        )
    shapes = tuple(shapes)
    if len(shapes) < 2 or not all(isinstance(s, int) for s in shapes):
        raise ValueError(
            "dense_factored: shapes must be a pair of ints (shape1, shape2)."
        )

    activation = get_default_override(cntk.layers.Dense, activation=activation)
    bias = get_default_override(cntk.layers.Dense, bias=bias)
    init_bias = get_default_override(cntk.layers.Dense, init_bias=init_bias)
    # how to use get_default_override for init parameter?

    output_shape1 = _as_tuple(shapes[0])
    output_shape2 = _as_tuple(shapes[1])

    # input_rank is never given here, so a single _INFERRED is passed.
    # The dimension inference may still create multiple axes.
    input_shape = _INFERRED

    # parameters bound to this Function
    # init_weights = _initializer_for(init, Record(output_rank=output_rank))
    init_weights = init
    W1 = Parameter(input_shape + output_shape1,
                   init=init_weights['W1'],
                   name='W1')
    W2 = Parameter(output_shape1 + output_shape2,
                   init=init_weights['W2'],
                   name='W2')
    b = Parameter(output_shape2, init=init_bias, name='b') if bias else None

    # expression of this function
    @BlockFunction('DenseFactored', name)
    def dense(x):
        r = times(x, W1)
        r = times(r, W2)
        if b:
            r = r + b
        if activation is not None:
            r = activation(r)
        return r

    return dense
def LSTM(shape, activation=default_override_or(tanh), weight_drop_rate=None,
         ih_init=default_override_or(glorot_uniform()), ih_bias=default_override_or(0),
         hh_init=default_override_or(glorot_uniform()), hh_bias=default_override_or(0),
         name=''):
    """
    PyTorch style implementation of LSTM. Used for loading pytorch pretrained models.

    The difference to cntk's own implementation is the order in which the
    stacked recurrent weights are sliced: here the four chunks are read as
    input gate, forget gate, candidate (g) and output gate. PyTorch also has
    two biases where cntk has one; here ``ih_bias + hh_bias`` is used to
    initialise a single bias parameter.

    Args:
        shape: hidden dimension; its last entry is stacked four times.
        activation: activation applied to the candidate and to the new cell.
        weight_drop_rate: if not None, dropout with this rate is applied to the
         hidden-to-hidden weight ``H`` before it is used.
        ih_init: initializer of the input weight ``W``.
        ih_bias: input bias initial value (summed with ``hh_bias``).
        hh_init: initializer of the hidden-to-hidden weight ``H``.
        hh_bias: hidden bias initial value (summed with ``ih_bias``).
        name: name of the resulting block function.

    Returns:
        Block function ``(dh, dc, x) --> (h, c)``.
    """
    activation = get_default_override(LSTM, activation=activation)
    ih_init = get_default_override(LSTM, ih_init=ih_init)
    ih_bias = get_default_override(LSTM, ih_bias=ih_bias)
    hh_init = get_default_override(LSTM, hh_init=hh_init)
    hh_bias = get_default_override(LSTM, hh_bias=hh_bias)

    stack_axis = -1
    shape = _as_tuple(shape)
    stacked_dim = shape[stack_axis]

    # shape with the last dimension repeated 4 times; W and H both stack all four gates
    gates_shape = shape[:stack_axis] + (stacked_dim * 4,)

    # combine both biases in pytorch into one
    init_bias = ih_bias + hh_bias

    b = Parameter(gates_shape, init=init_bias, name='b')                 # bias
    W = Parameter(_INFERRED + gates_shape, init=ih_init, name='W')       # input
    H = Parameter(shape + gates_shape, init=hh_init, name='H')           # hidden-to-hidden

    dropout = None
    if weight_drop_rate is not None:
        dropout = C.layers.Dropout(dropout_rate=weight_drop_rate, name='h_dropout')

    @C.BlockFunction('PT::LSTM', name)
    def lstm(dh, dc, x):
        # optionally drop entries of the recurrent weight itself
        if weight_drop_rate is not None:
            recurrent_weight = dropout(H)
        else:
            recurrent_weight = H

        # projected contribution from input(s), hidden, and bias
        proj4 = b + times(x, W) + times(dh, recurrent_weight)

        # slicing layout different from cntk's implementation: i, f, g, o
        it_proj, ft_proj, bit_proj, ot_proj = [
            slice(proj4, stack_axis, k * stacked_dim, (k + 1) * stacked_dim)
            for k in range(4)
        ]

        input_gate = sigmoid(it_proj)                       # input gate(t)
        gated_input = input_gate * activation(bit_proj)     # applied to tanh of input network

        forget_gate = sigmoid(ft_proj)                      # forget-me-not gate(t)
        kept_cell = forget_gate * dc                        # applied to cell(t-1)

        ct = kept_cell + gated_input                        # c(t) is sum of both

        output_gate = sigmoid(ot_proj)                      # output gate(t)
        ht = output_gate * activation(ct)                   # applied to tanh(cell(t))
        return ht, ct

    return lstm
def _RecurrentBlock(type, shape, cell_shape, activation, use_peepholes, init, init_bias,
                    enable_self_stabilization, dropout_rate, seed, name=''):
    '''
    Helper to create a recurrent block of type 'IndRNN', 'IndyLSTM', or 'WeightDroppedLSTM'.

    Args:
        type: one of 'IndRNN', 'IndyLSTM', 'WeightDroppedLSTM'; selects the step
            function and the stacking factor (1 for IndRNN, 4 for the LSTMs).
            (The name shadows the builtin but is kept for interface compatibility.)
        shape: output shape; must be rank-1.
        cell_shape: cell shape; must be rank-1. If not None, a final projection
            from cell_shape to shape is added; if None, shape is used.
        activation: activation applied to the candidate / cell state (LSTMs) or
            to the summed projection (IndRNN).
        use_peepholes: if True, peephole parameters Ci/Cf/Co are created and
            applied elementwise in the LSTM step functions.
        init: initializer for all weight parameters.
        init_bias: initializer for the bias parameter.
        enable_self_stabilization: forwarded to every Stabilizer instance.
        dropout_rate: rate of the dropout applied to the hidden-to-hidden
            weight H; only used by 'WeightDroppedLSTM'.
        seed: seed forwarded to the Dropout layer.
        name: name of the resulting block function.

    Returns:
        The selected step function wrapped as BlockFunction(type, name).

    Raises:
        ValueError: if shape or cell_shape is not rank-1.
        KeyError: if type is not one of the supported block types.
    '''
    has_projection = cell_shape is not None

    shape = _as_tuple(shape)
    cell_shape = _as_tuple(cell_shape) if cell_shape is not None else shape
    if len(shape) != 1 or len(cell_shape) != 1:
        raise ValueError("%s: shape and cell_shape must be vectors (rank-1 tensors)" % type)
        # otherwise we'd need to fix slicing and Param initializers

    stack_axis = -1  # for efficient computation, we stack multiple variables (along the fastest-changing one, to match BS)

    # determine stacking dimensions; an unsupported type raises KeyError here
    stacked_dim = cell_shape[stack_axis]
    num_stacked = {
        'IndRNN': 1,
        'IndyLSTM': 4,
        'WeightDroppedLSTM': 4,
    }[type]
    cell_shape_list = list(cell_shape)
    cell_shape_list[stack_axis] = stacked_dim * num_stacked
    cell_shape_stacked = tuple(cell_shape_list)  # stack_axis widened num_stacked times
    cell_shape_stacked_H = cell_shape_stacked    # the recurrent weights use the same stacked shape

    # parameters
    # NOTE(review): H is only consumed by 'WeightDroppedLSTM' but is still created for
    # every type, as before, so the parameter construction order is unchanged.
    b = Parameter(cell_shape_stacked, init=init_bias, name='b')  # bias
    W = Parameter(_INFERRED + cell_shape_stacked, init=init, name='W')  # input
    H = Parameter(shape + cell_shape_stacked_H, init=init, name='H')  # hidden-to-hidden
    H1 = Parameter(cell_shape_stacked_H, init=init, name='H1') if type == 'IndyLSTM' else None  # hidden-to-hidden, elementwise
    H2 = Parameter(shape, init=init, name='H2') if type == 'IndRNN' else None  # hidden-to-hidden, elementwise
    Ci = Parameter(cell_shape, init=init, name='Ci') if use_peepholes else None  # cell-to-hidden {note: applied elementwise}
    Cf = Parameter(cell_shape, init=init, name='Cf') if use_peepholes else None  # cell-to-hidden {note: applied elementwise}
    Co = Parameter(cell_shape, init=init, name='Co') if use_peepholes else None  # cell-to-hidden {note: applied elementwise}

    Wmr = Parameter(cell_shape + shape, init=init, name='P') if has_projection else None  # final projection

    # each use of a stabilizer layer must get its own instance
    Sdh = Stabilizer(enable_self_stabilization=enable_self_stabilization, name='dh_stabilizer')
    Sdc = Stabilizer(enable_self_stabilization=enable_self_stabilization, name='dc_stabilizer')
    Sct = Stabilizer(enable_self_stabilization=enable_self_stabilization, name='c_stabilizer')
    Sht = Stabilizer(enable_self_stabilization=enable_self_stabilization, name='P_stabilizer')

    # DropConnect on H: only the weight-dropped LSTM uses it, so only build the layer for that type
    if type == 'WeightDroppedLSTM':
        dropout = C.layers.Dropout(dropout_rate=dropout_rate, seed=seed, name='h_dropout')
    else:
        dropout = None

    def peep(proj, cell, weight):
        # helper to inject peephole connection if requested
        return proj + weight * cell if use_peepholes else proj

    def lstm_from_projection(proj4, dc, dcs):
        # Shared LSTM gate computation on a stacked projection (cntk layout: i, candidate, f, o).
        it_proj = slice(proj4, stack_axis, 0 * stacked_dim, 1 * stacked_dim)  # split along stack_axis
        bit_proj = slice(proj4, stack_axis, 1 * stacked_dim, 2 * stacked_dim)
        ft_proj = slice(proj4, stack_axis, 2 * stacked_dim, 3 * stacked_dim)
        ot_proj = slice(proj4, stack_axis, 3 * stacked_dim, 4 * stacked_dim)

        it = sigmoid(peep(it_proj, dcs, Ci))  # input gate(t)
        # TODO: should both activations be replaced?
        bit = it * activation(bit_proj)  # applied to tanh of input network

        ft = sigmoid(peep(ft_proj, dcs, Cf))  # forget-me-not gate(t)
        bft = ft * dc  # applied to cell(t-1); note: uses the unstabilized dc

        ct = bft + bit  # c(t) is sum of both

        ot = sigmoid(peep(ot_proj, Sct(ct), Co))  # output gate(t)
        ht = ot * activation(ct)  # applied to tanh(cell(t))

        h = times(Sht(ht), Wmr) if has_projection else ht
        return h, ct

    # define the model function itself
    # general interface for Recurrence():
    #   (all previous outputs delayed, input) --> (outputs and state)
    # where
    #  - the first output is the main output, e.g. 'h' for LSTM
    #  - the remaining outputs, if any, are additional state
    #  - if for some reason output != state, then output is still fed back and should just be ignored by the recurrent block

    # LSTM model function with dropout on the hidden-to-hidden weights
    # in this case:
    #   (dh, dc, x) --> (h, c)
    def weight_dropped_lstm(dh, dc, x):
        dhs = Sdh(dh)  # previous values, stabilized
        dcs = Sdc(dc)
        # note: input does not get a stabilizer here, user is meant to do that outside

        # projected contribution from input(s), hidden, and bias
        proj4 = b + times(x, W) + times(dhs, dropout(H))
        return lstm_from_projection(proj4, dc, dcs)

    # LSTM model function with an elementwise recurrent weight
    # in this case:
    #   (dh, dc, x) --> (h, c)
    def indy_lstm(dh, dc, x):
        dhs = Sdh(dh)  # previous values, stabilized
        dcs = Sdc(dc)
        # note: input does not get a stabilizer here, user is meant to do that outside

        # projected contribution from input(s), hidden, and bias;
        # dhs is repeated once per stacked gate (4) and scaled elementwise by H1
        proj4 = b + times(x, W) + C.splice(dhs, dhs, dhs, dhs) * H1
        return lstm_from_projection(proj4, dc, dcs)

    # RNN model function with an elementwise recurrent weight
    # in this case:
    #   (dh, x) --> h
    def ind_rnn(dh, x):
        dhs = Sdh(dh)  # previous value, stabilized
        ht = activation(times(x, W) + dhs * H2 + b)
        h = times(Sht(ht), Wmr) if has_projection else ht
        return h

    function = {
        'IndRNN': ind_rnn,
        'IndyLSTM': indy_lstm,
        'WeightDroppedLSTM': weight_dropped_lstm,
    }[type]

    # return the corresponding lambda as a CNTK Function
    return BlockFunction(type, name)(function)