def create_model(input_dim):
    row = sequence.input_variable(shape=input_dim)
    col = sequence.input_variable(shape=input_dim)
    rowh = Sequential([Embedding(opt.embed), Stabilizer(), Dropout(opt.dropout)])(row)
    colh = Sequential([Embedding(opt.embed), Stabilizer(), Dropout(opt.dropout)])(col)

    x = C.splice(rowh, colh, axis=-1)
    x = lightlstm(opt.embed, opt.nhid)(x)
    x = For(range(opt.layer - 1), lambda: lightlstm(opt.nhid, opt.nhid))(x)
    rowh = C.slice(x, -1, opt.nhid * 0, opt.nhid * 1)
    colh = C.slice(x, -1, opt.nhid * 1, opt.nhid * 2)

    row_predict = Sequential([Dropout(opt.dropout), Dense(input_dim)])(rowh)
    col_predict = Sequential([Dropout(opt.dropout), Dense(input_dim)])(colh)

    # variables: row and col labels
    row_label = sequence.input_variable(shape=input_dim)
    col_label = sequence.input_variable(shape=input_dim)

    model = C.combine([row_predict, col_predict])

    return {'row': row,
            'col': col,
            'row_label': row_label,
            'col_label': col_label,
            'model': model}
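# Hedged sketch (an assumption, not part of the source): one way the two output
# heads returned above could be combined into a training criterion, summing the
# row and column cross-entropies against their respective labels.
import cntk as C

def create_criterion_sketch(net):
    row_out, col_out = net['model'].outputs                        # row_predict, col_predict
    ce = C.cross_entropy_with_softmax(row_out, net['row_label']) + \
         C.cross_entropy_with_softmax(col_out, net['col_label'])   # joint loss
    errs = C.classification_error(row_out, net['row_label'])       # row-prediction error as metric
    return ce, errs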
def create_model(input_sequence, label_sequence, vocab_dim, hidden_dim):
    # Create the rnn that computes the latent representation for the next token.
    rnn_with_latent_output = Sequential([
        C.layers.Embedding(hidden_dim),
        For(range(num_layers), lambda: Sequential([
            Stabilizer(),
            Recurrence(LSTM(hidden_dim), go_backwards=False)
        ])),
    ])

    # Apply it to the input sequence.
    latent_vector = rnn_with_latent_output(input_sequence)

    # Connect the latent output to (sampled/full) softmax.
    if use_sampled_softmax:
        weights = load_sampling_weights(token_frequencies_file_path)
        smoothed_weights = np.float32(np.power(weights, alpha))
        sampling_weights = C.reshape(C.Constant(smoothed_weights), shape=(1, vocab_dim))
        z, ce, errs = cross_entropy_with_sampled_softmax(
            latent_vector, label_sequence, vocab_dim, hidden_dim,
            softmax_sample_size, sampling_weights)
    else:
        z, ce, errs = cross_entropy_with_full_softmax(
            latent_vector, label_sequence, vocab_dim, hidden_dim)

    return z, ce, errs
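# Hedged sketch (an assumption, not the source's exact helper): what a helper
# like the cross_entropy_with_full_softmax referenced above typically does --
# a learned projection from hidden_dim to vocab_dim plus loss and error metric.
import cntk as C

def cross_entropy_with_full_softmax_sketch(latent_vector, label_sequence, vocab_dim, hidden_dim):
    W = C.parameter(shape=(hidden_dim, vocab_dim), init=C.glorot_uniform())  # output projection
    b = C.parameter(shape=vocab_dim, init=0)                                 # output bias
    z = C.times(latent_vector, W) + b                                        # logits over the vocabulary
    ce = C.cross_entropy_with_softmax(z, label_sequence)                     # training criterion
    errs = C.classification_error(z, label_sequence)                         # evaluation metric
    return z, ce, errs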
def create_model(output_dim):
    return Sequential([
        For(range(num_layers), lambda: Sequential([
            Stabilizer(),
            Recurrence(LSTM(hidden_dim), go_backwards=False)
        ])),
        Dense(output_dim)
    ])
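# Hedged usage sketch (not part of the source): applying the factory above to a
# sequence input and attaching a standard criterion. Assumes this runs in the
# same module as create_model (with its layer imports), and that num_layers,
# hidden_dim, and vocab_size are defined there; the values below are illustrative.
import cntk as C

num_layers, hidden_dim, vocab_size = 2, 256, 128

features = C.sequence.input_variable(shape=vocab_size)
labels = C.sequence.input_variable(shape=vocab_size)

z = create_model(vocab_size)(features)            # stacked LSTM + Dense head
ce = C.cross_entropy_with_softmax(z, labels)      # training loss
errs = C.classification_error(z, labels)          # evaluation metric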
def test_layers_stabilizer():
    y = input(4)
    p = Stabilizer()(y)

    dat = np.array([[1.0, 2.0, 3.0, 4.0]], dtype=np.float32)
    res = p(y).eval({y: dat})

    # a stabilizer starts with having no effect, hence input == output
    np.testing.assert_array_almost_equal(res, dat, decimal=6,
        err_msg="Error in Stabilizer computation")
    # one-hot encode the sampled character indices as features (X) and labels (Y)
    X = np.eye(vocab_size, dtype=np.float32)[xi]
    Y = np.eye(vocab_size, dtype=np.float32)[yi]

    return [X], [Y]

get_sample(0)

input_sequence = sequence.input_variable(shape=vocab_size)
label_sequence = sequence.input_variable(shape=vocab_size)

model = Sequential([
    For(range(2), lambda: Sequential([
        Stabilizer(),
        Recurrence(LSTM(256), go_backwards=False)
    ])),
    Dense(vocab_size)
])

z = model(input_sequence)
z_sm = cntk.softmax(z)

ce = cross_entropy_with_softmax(z, label_sequence)
errs = classification_error(z, label_sequence)

lr_per_sample = learning_rate_schedule(0.001, UnitType.sample)
momentum_time_constant = momentum_as_time_constant_schedule(1100)
clipping_threshold_per_sample = 5.0
gradient_clipping_with_truncation = True
learner = momentum_sgd(z.parameters, lr_per_sample, momentum_time_constant,
                       gradient_clipping_threshold_per_sample=clipping_threshold_per_sample,
                       gradient_clipping_with_truncation=gradient_clipping_with_truncation)
    X = np.eye(vocab_size, dtype=np.float32)[xi]
    Y = np.eye(vocab_size, dtype=np.float32)[yi]

    return [X], [Y]

sample(0)

input_seq_axis = Axis('inputAxis')
input_sequence = sequence.input_variable(shape=vocab_size, sequence_axis=input_seq_axis)
label_sequence = sequence.input_variable(shape=vocab_size, sequence_axis=input_seq_axis)

# model = Sequential([Dense(300), Dense(vocab_size)])
model = Sequential([
    For(range(2), lambda: Sequential([
        Stabilizer(),
        Recurrence(LSTM(256), go_backwards=False)
    ])),
    Dense(vocab_size)
])

z = model(input_sequence)
ce = cross_entropy_with_softmax(z, label_sequence)
errs = classification_error(z, label_sequence)

lr_per_sample = learning_rate_schedule(0.001, UnitType.sample)
momentum_time_constant = momentum_as_time_constant_schedule(1100)
clipping_threshold_per_sample = 5.0
gradient_clipping_with_truncation = True
learner = momentum_sgd(z.parameters, lr_per_sample, momentum_time_constant,
                       gradient_clipping_threshold_per_sample=clipping_threshold_per_sample,
                       gradient_clipping_with_truncation=gradient_clipping_with_truncation)
progress_printer = ProgressPrinter(freq=100, tag='Training')
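# Hedged sketch (not part of the source): the criterion, learner, and progress
# printer above are typically handed to a Trainer and driven by a minibatch
# loop. num_minibatches and get_minibatch below are hypothetical placeholders.
import cntk as C

trainer = C.Trainer(z, (ce, errs), learner, progress_printer)

for i in range(num_minibatches):                       # hypothetical loop bound
    features, labels = get_minibatch(i)                # hypothetical data helper
    trainer.train_minibatch({input_sequence: features, label_sequence: labels})
trainer.summarize_training_progress()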
def _RecurrentBlock(type, shape, cell_shape, activation, use_peepholes,
                    init, init_bias,
                    enable_self_stabilization, dropout_rate, seed,
                    name=''):
    '''
    Helper to create a recurrent block of type 'WeightDroppedLSTM', 'IndyLSTM', or 'IndRNN'.
    '''

    has_projection = cell_shape is not None

    shape = _as_tuple(shape)

    cell_shape = _as_tuple(cell_shape) if cell_shape is not None else shape
    if len(shape) != 1 or len(cell_shape) != 1:
        raise ValueError("%s: shape and cell_shape must be vectors (rank-1 tensors)" % type)
        # otherwise we'd need to fix slicing and Param initializers

    stack_axis = -1  # for efficient computation, we stack multiple variables (along the fastest-changing one, to match BS)
    # determine stacking dimensions
    cell_shape_list = list(cell_shape)
    stacked_dim = cell_shape_list[stack_axis]
    cell_shape_list[stack_axis] = stacked_dim * {
        'IndRNN': 1,
        'IndyLSTM': 4,
        'WeightDroppedLSTM': 4
    }[type]
    cell_shape_stacked = tuple(cell_shape_list)  # patched dims with stack_axis duplicated 4 times
    cell_shape_list[stack_axis] = stacked_dim * {
        'IndRNN': 1,
        'IndyLSTM': 4,
        'WeightDroppedLSTM': 4
    }[type]
    cell_shape_stacked_H = tuple(cell_shape_list)  # patched dims with stack_axis duplicated 4 times

    # parameters
    b  = Parameter(            cell_shape_stacked,   init=init_bias, name='b')   # bias
    W  = Parameter(_INFERRED + cell_shape_stacked,   init=init,      name='W')   # input
    H  = Parameter(shape     + cell_shape_stacked_H, init=init,      name='H')   # hidden-to-hidden
    H1 = Parameter(            cell_shape_stacked_H, init=init,      name='H1') if type == 'IndyLSTM' else None  # hidden-to-hidden
    H2 = Parameter(shape,                            init=init,      name='H2') if type == 'IndRNN' else None    # hidden-to-hidden
    Ci = Parameter(            cell_shape,           init=init,      name='Ci') if use_peepholes else None       # cell-to-hidden {note: applied elementwise}
    Cf = Parameter(            cell_shape,           init=init,      name='Cf') if use_peepholes else None       # cell-to-hidden {note: applied elementwise}
    Co = Parameter(            cell_shape,           init=init,      name='Co') if use_peepholes else None       # cell-to-hidden {note: applied elementwise}

    Wmr = Parameter(cell_shape + shape, init=init, name='P') if has_projection else None  # final projection

    # each use of a stabilizer layer must get its own instance
    Sdh = Stabilizer(enable_self_stabilization=enable_self_stabilization, name='dh_stabilizer')
    Sdc = Stabilizer(enable_self_stabilization=enable_self_stabilization, name='dc_stabilizer')
    Sct = Stabilizer(enable_self_stabilization=enable_self_stabilization, name='c_stabilizer')
    Sht = Stabilizer(enable_self_stabilization=enable_self_stabilization, name='P_stabilizer')

    # DropConnect
    dropout = C.layers.Dropout(dropout_rate=dropout_rate, seed=seed, name='h_dropout')

    # define the model function itself
    # general interface for Recurrence():
    #   (all previous outputs delayed, input) --> (outputs and state)
    # where
    #  - the first output is the main output, e.g. 'h' for LSTM
    #  - the remaining outputs, if any, are additional state
    #  - if for some reason output != state, then output is still fed back and should just be ignored by the recurrent block

    # WeightDroppedLSTM model function
    # in this case:
    #   (dh, dc, x) --> (h, c)
    def weight_dropped_lstm(dh, dc, x):

        dhs = Sdh(dh)  # previous values, stabilized
        dcs = Sdc(dc)
        # note: input does not get a stabilizer here, user is meant to do that outside

        # projected contribution from input(s), hidden, and bias
        proj4 = b + times(x, W) + times(dhs, dropout(H))

        it_proj  = slice(proj4, stack_axis, 0*stacked_dim, 1*stacked_dim)  # split along stack_axis
        bit_proj = slice(proj4, stack_axis, 1*stacked_dim, 2*stacked_dim)
        ft_proj  = slice(proj4, stack_axis, 2*stacked_dim, 3*stacked_dim)
        ot_proj  = slice(proj4, stack_axis, 3*stacked_dim, 4*stacked_dim)

        # helper to inject peephole connection if requested
        def peep(x, c, C):
            return x + C * c if use_peepholes else x

        it = sigmoid(peep(it_proj, dcs, Ci))      # input gate(t)
        # TODO: should both activations be replaced?
        bit = it * activation(bit_proj)           # applied to tanh of input network

        ft = sigmoid(peep(ft_proj, dcs, Cf))      # forget-me-not gate(t)
        bft = ft * dc                             # applied to cell(t-1)

        ct = bft + bit                            # c(t) is sum of both

        ot = sigmoid(peep(ot_proj, Sct(ct), Co))  # output gate(t)
        ht = ot * activation(ct)                  # applied to tanh(cell(t))

        c = ct                                    # cell value
        h = times(Sht(ht), Wmr) if has_projection else ht

        return h, c

    # IndyLSTM model function
    # in this case:
    #   (dh, dc, x) --> (h, c)
    def indy_lstm(dh, dc, x):

        dhs = Sdh(dh)  # previous values, stabilized
        dcs = Sdc(dc)
        # note: input does not get a stabilizer here, user is meant to do that outside

        # projected contribution from input(s), hidden, and bias
        proj4 = b + times(x, W) + C.splice(dhs, dhs, dhs, dhs) * H1  # 4 is the number of stacked dim

        it_proj  = slice(proj4, stack_axis, 0*stacked_dim, 1*stacked_dim)  # split along stack_axis
        bit_proj = slice(proj4, stack_axis, 1*stacked_dim, 2*stacked_dim)
        ft_proj  = slice(proj4, stack_axis, 2*stacked_dim, 3*stacked_dim)
        ot_proj  = slice(proj4, stack_axis, 3*stacked_dim, 4*stacked_dim)

        # helper to inject peephole connection if requested
        def peep(x, c, C):
            return x + C * c if use_peepholes else x

        it = sigmoid(peep(it_proj, dcs, Ci))      # input gate(t)
        # TODO: should both activations be replaced?
        bit = it * activation(bit_proj)           # applied to tanh of input network

        ft = sigmoid(peep(ft_proj, dcs, Cf))      # forget-me-not gate(t)
        bft = ft * dc                             # applied to cell(t-1)

        ct = bft + bit                            # c(t) is sum of both

        ot = sigmoid(peep(ot_proj, Sct(ct), Co))  # output gate(t)
        ht = ot * activation(ct)                  # applied to tanh(cell(t))

        c = ct                                    # cell value
        h = times(Sht(ht), Wmr) if has_projection else ht

        return h, c

    # IndRNN model function
    # in this case:
    #   (dh, x) --> (h)
    def ind_rnn(dh, x):
        dhs = Sdh(dh)  # previous value, stabilized
        ht = activation(times(x, W) + dhs * H2 + b)
        h = times(Sht(ht), Wmr) if has_projection else ht
        return h

    function = {
        'IndRNN': ind_rnn,
        'IndyLSTM': indy_lstm,
        'WeightDroppedLSTM': weight_dropped_lstm
    }[type]

    # return the corresponding lambda as a CNTK Function
    return BlockFunction(type, name)(function)
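# Hedged usage sketch (not part of the source): one plausible way to expose the
# helper above as a layer factory and drive it with Recurrence over a sequence.
# The default argument values, the seed, and the 100/256 dimensions are assumptions.
import cntk as C
from cntk.layers import Recurrence

def WeightDroppedLSTM(shape, dropout_rate, cell_shape=None, activation=C.tanh,
                      use_peepholes=False, init=C.glorot_uniform(), init_bias=0,
                      enable_self_stabilization=False, seed=1, name=''):
    # thin wrapper fixing type='WeightDroppedLSTM'; seed=1 is an arbitrary choice
    return _RecurrentBlock('WeightDroppedLSTM', shape, cell_shape, activation,
                           use_peepholes, init, init_bias,
                           enable_self_stabilization, dropout_rate, seed, name=name)

x = C.sequence.input_variable(shape=100)                     # illustrative input dim
h = Recurrence(WeightDroppedLSTM(256, dropout_rate=0.2))(x)  # hidden-state sequence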