def gru(dh, x):
    """Compute one GRU step from the previous hidden state and the input.

    The weights and helpers used here (``W``, ``H``, ``H1``, ``b``, ``Wmr``,
    ``Sdh``, ``Sht``, ``activation``, ``stack_axis``, ``stacked_dim``,
    ``has_projection``) are free names resolved outside this function.

    Args:
        dh: hidden state of the previous step.
        x: input of the current step. It is not stabilized here; the caller
            is expected to do that.

    Returns:
        ``Function.NamedOutput(h=...)`` wrapping the new hidden state, after
        the output projection when ``has_projection`` is true.
    """
    prev_h = Sdh(dh)  # previous hidden state, stabilized

    # Stacked projections: the input path (with bias) carries three slots,
    # the hidden path only the two gate slots.
    input_proj = b + times(x, W)
    hidden_proj = times(prev_h, H)

    def slot(stacked, index):
        # Cut the index-th block of width stacked_dim out of the stack axis.
        return slice(stacked, stack_axis,
                     index * stacked_dim, (index + 1) * stacked_dim)

    update_proj = slot(input_proj, 0) + slot(hidden_proj, 0)
    reset_proj = slot(input_proj, 1) + slot(hidden_proj, 1)
    candidate_proj = slot(input_proj, 2)

    update_gate = sigmoid(update_proj)  # z(t)
    reset_gate = sigmoid(reset_proj)    # r(t)

    # Candidate state: the reset gate is applied to the hidden state before
    # that state is projected through H1.
    gated_h = prev_h * reset_gate
    candidate = activation(candidate_proj + times(gated_h, H1))

    # New hidden state: interpolate between candidate and previous state.
    new_h = (1 - update_gate) * candidate + update_gate * prev_h

    # For comparison, CUDNN_GRU:
    #   i(t)  = sigmoid(W_i x(t) + R_i h(t-1) + b_Wi + b_Ru)
    #   r(t)  = sigmoid(W_r x(t) + R_r h(t-1) + b_Wr + b_Rr)   -- same up to here
    #   h'(t) = tanh(W_h x(t) + r(t) .* (R_h h(t-1)) + b_Wh + b_Rh)
    #           -- r applied after the projection there
    #   h(t)  = (1 - i(t) .* h'(t)) + i(t) .* h(t-1)
    #           -- TODO: bracketing still needs to be confirmed with NVIDIA

    if has_projection:
        h = times(Sht(new_h), Wmr)
    else:
        h = new_h

    # The new state is returned as a named output.
    return Function.NamedOutput(h=h)
def lstm(dh, dc, x):
    """Compute one LSTM step with optional dropout on the recurrent weights.

    ``b``, ``W``, ``H``, ``dropout``, ``weight_drop_rate``, ``activation``,
    ``stack_axis`` and ``stacked_dim`` are free names resolved outside this
    function.

    Args:
        dh: hidden state of the previous step.
        dc: cell state of the previous step.
        x: input of the current step.

    Returns:
        Tuple ``(ht, ct)``: the new hidden state and the new cell state.
    """
    # Recurrent weights go through ``dropout`` only when a rate is configured.
    if weight_drop_rate is not None:
        recurrent_weights = dropout(H)
    else:
        recurrent_weights = H

    # Projected contribution from input, hidden state and bias.
    stacked = b + times(x, W) + times(dh, recurrent_weights)

    # Split along stack_axis. The slot order here is input, forget, g, output,
    # which differs from cntk's own implementation.
    it_proj, ft_proj, bit_proj, ot_proj = [
        slice(stacked, stack_axis, k * stacked_dim, (k + 1) * stacked_dim)
        for k in range(4)
    ]

    input_gate = sigmoid(it_proj)
    gated_input = input_gate * activation(bit_proj)  # gate applied to the g slot

    forget_gate = sigmoid(ft_proj)
    kept_cell = forget_gate * dc                     # gate applied to cell(t-1)

    ct = kept_cell + gated_input                     # c(t) is the sum of both

    output_gate = sigmoid(ot_proj)
    ht = output_gate * activation(ct)                # gate applied to activation(c(t))

    return ht, ct
def weight_dropped_lstm(dh, dc, x):
    """Compute one LSTM step whose recurrent weights always pass through ``dropout``.

    Weights, stabilizers and flags (``b``, ``W``, ``H``, ``Ci``, ``Cf``,
    ``Co``, ``Wmr``, ``Sdh``, ``Sdc``, ``Sct``, ``Sht``, ``dropout``,
    ``activation``, ``use_peepholes``, ``has_projection``, ``stack_axis``,
    ``stacked_dim``) are free names resolved outside this function.

    Args:
        dh: hidden state of the previous step.
        dc: cell state of the previous step.
        x: input of the current step. It is not stabilized here; the caller
            is expected to do that.

    Returns:
        Tuple ``(h, c)``: the (optionally projected) hidden state and the
        new cell state.
    """
    # Previous values, stabilized.
    prev_h = Sdh(dh)
    prev_c = Sdc(dc)

    # Projected contribution from input, hidden state and bias.
    stacked = b + times(x, W) + times(prev_h, dropout(H))

    # Split along stack_axis; slot order is input, g, forget, output.
    it_proj, bit_proj, ft_proj, ot_proj = [
        slice(stacked, stack_axis, k * stacked_dim, (k + 1) * stacked_dim)
        for k in range(4)
    ]

    def peep(gate_proj, cell, peep_weight):
        # Inject the peephole connection only when it is requested.
        if use_peepholes:
            return gate_proj + peep_weight * cell
        return gate_proj

    input_gate = sigmoid(peep(it_proj, prev_c, Ci))
    # TODO: should both activations be replaced?
    gated_input = input_gate * activation(bit_proj)

    forget_gate = sigmoid(peep(ft_proj, prev_c, Cf))
    # The forget gate multiplies the raw dc, not the stabilized copy.
    kept_cell = forget_gate * dc

    ct = kept_cell + gated_input  # c(t) is the sum of both

    output_gate = sigmoid(peep(ot_proj, Sct(ct), Co))
    ht = output_gate * activation(ct)

    c = ct
    if has_projection:
        h = times(Sht(ht), Wmr)
    else:
        h = ht

    return h, c
def lstm(dh, dc, sv, x):
    """Compute one step of an LSTM that carries an extra vector ``sv``.

    All weights (``b``, ``W``, ``H``, ``Hsv``, ``Wrg``, ``Hrg``, ``Hsvrg``,
    ``brg``, ``Wcx``, ``Hcx``, ``Wfc``) plus ``stack_axis`` and
    ``stacked_dim`` are free names resolved outside this function.

    Args:
        dh: hidden state of the previous step.
        dc: cell state of the previous step.
        sv: extra vector of the previous step; it is scaled by the reading
            gate and fed into the cell update.
        x: input of the current step.

    Returns:
        Tuple ``(h, c, v)``: new hidden state, new cell state and the gated
        ``sv``.
    """
    # Projected contribution from input, hidden state, sv and bias,
    # holding three stacked gate slots.
    stacked = b + times(x, W) + times(dh, H) + times(sv, Hsv)

    def slot(index):
        return slice(stacked, stack_axis,
                     index * stacked_dim, (index + 1) * stacked_dim)

    it_proj = slot(0)
    ft_proj = slot(1)
    ot_proj = slot(2)

    input_gate = sigmoid(it_proj)
    forget_gate = sigmoid(ft_proj)
    output_gate = sigmoid(ot_proj)

    # Reading gate: decides how much of sv is carried over.
    reading_gate = sigmoid(
        times(x, Wrg) + times(dh, Hrg) + times(sv, Hsvrg) + brg)
    v = reading_gate * sv

    # Candidate cell input (no stabilization is applied here).
    candidate = tanh(times(x, Wcx) + times(dh, Hcx))

    # Memory cell update, including the contribution of the gated sv.
    c = input_gate * candidate + forget_gate * dc + tanh(times(v, Wfc))
    h = output_gate * tanh(c)

    return (h, c, v)
def predict(model, params):
    """Build the prediction node for the given model.

    Args:
        model: function whose arguments include ``'context'`` and
            ``'entity_ids_mask'`` and whose last output holds the answers.
        params: object providing ``entity_dim`` and ``vocab_dim``.

    Returns:
        The ``hardmax`` node (named ``'pred_max'``) over the summed,
        entity-weighted answers.
    """
    args_by_name = {}
    for argument in model.arguments:
        args_by_name[argument.name] = argument

    # Not used below; the lookup is kept so that a model without a 'context'
    # argument still fails here with KeyError.
    context = args_by_name['context']
    mask = args_by_name['entity_ids_mask']

    is_entity = greater(mask, 0, name='condidion')

    # Gather all entity positions; gather creates a new dynamic sequence axis.
    entities_all = sequence.gather(is_entity, is_entity, name='entities_all')

    # The entity-id input is declared on the dynamic axes produced by the
    # gather above.
    entity_ids = input(shape=(params.entity_dim),
                       is_sparse=True,
                       dynamic_axes=entities_all.dynamic_axes,
                       name='entity_ids')

    # Read but not used further in this function.
    wordvocab_dim = params.vocab_dim

    gathered_answers = sequence.gather(model.outputs[-1], is_entity)
    answers = sequence.scatter(gathered_answers, entities_all, name='Final_Ans')

    reshaped_ids = ops.reshape(entity_ids, params.entity_dim)
    entity_id_matrix = ops.slice(reshaped_ids, -1, 1, params.entity_dim)

    weighted = element_times(answers, entity_id_matrix)
    expand_pred = sequence.reduce_sum(weighted)

    return ops.hardmax(expand_pred, name='pred_max')
def _convolution(x):
    """Apply the (optionally grouped) convolution to ``x`` and add the bias.

    ``group``, ``w``, ``groups_kernel``, ``sub_input_channels``,
    ``_conv_ops``, ``bias_init`` and ``b`` are free names resolved outside
    this function.

    Args:
        x: input to convolve; for ``group != 1`` it is cut along axis 0 into
            ``group`` pieces of ``sub_input_channels`` each.

    Returns:
        The convolution result, with ``b`` added when ``bias_init`` is not
        ``None``.
    """
    if group == 1:
        # Single group: one convolution over the whole input.
        apply_x = _conv_ops(w, x)
    else:
        # Cut the input into one piece per group along axis 0 ...
        groups_data = []
        for i in range(0, group):
            first = i * sub_input_channels
            last = (i + 1) * sub_input_channels
            groups_data.append(
                ops.slice(x, axis=0, begin_index=first, end_index=last))

        # ... convolve each piece with its own kernel ...
        apply_sub = []
        for group_kernel, group_data in zip(groups_kernel, groups_data):
            apply_sub.append(_conv_ops(group_kernel, group_data))

        # ... and stitch the per-group results back together along axis 0.
        apply_x = ops.splice(*apply_sub, axis=0)

    if bias_init is not None:
        apply_x += b

    return apply_x
def lstm(dh, dc, x):
    """Compute one LSTM step with optional peepholes and output projection.

    Weights, stabilizers and flags (``b``, ``W``, ``H``, ``Ci``, ``Cf``,
    ``Co``, ``Wmr``, ``Sdh``, ``Sdc``, ``Sct``, ``Sht``, ``activation``,
    ``use_peepholes``, ``has_projection``, ``stack_axis``, ``stacked_dim``)
    are free names resolved outside this function.

    Args:
        dh: hidden state of the previous step.
        dc: cell state of the previous step.
        x: input of the current step. It is not stabilized here; the caller
            is expected to do that.

    Returns:
        Tuple ``(Function.NamedOutput(h=h), Function.NamedOutput(c=c))``.
        The outputs are named, but their order still matters.
    """
    # Previous values, stabilized.
    prev_h = Sdh(dh)
    prev_c = Sdc(dc)

    # Projected contribution from input, hidden state and bias.
    stacked = b + times(x, W) + times(prev_h, H)

    def slot(index):
        # Cut the index-th block of width stacked_dim out of the stack axis.
        return slice(stacked, stack_axis,
                     index * stacked_dim, (index + 1) * stacked_dim)

    it_proj = slot(0)
    bit_proj = slot(1)
    ft_proj = slot(2)
    ot_proj = slot(3)

    def peep(gate_proj, cell, peep_weight):
        # Inject the peephole connection only when it is requested.
        if not use_peepholes:
            return gate_proj
        return gate_proj + peep_weight * cell

    input_gate = sigmoid(peep(it_proj, prev_c, Ci))
    # TODO: should both activations be replaced?
    gated_input = input_gate * activation(bit_proj)

    forget_gate = sigmoid(peep(ft_proj, prev_c, Cf))
    # The forget gate multiplies the raw dc, not the stabilized copy.
    kept_cell = forget_gate * dc

    ct = kept_cell + gated_input  # c(t) is the sum of both

    output_gate = sigmoid(peep(ot_proj, Sct(ct), Co))
    ht = output_gate * activation(ct)

    c = ct
    if has_projection:
        h = times(Sht(ht), Wmr)
    else:
        h = ht

    return (Function.NamedOutput(h=h), Function.NamedOutput(c=c))
def _convolution(x):
    """Convolve ``x``, splitting it into channel groups when ``group != 1``.

    ``group``, ``w``, ``groups_kernel``, ``sub_input_channels``,
    ``_conv_ops``, ``bias_init`` and ``b`` are free names resolved outside
    this function.

    Args:
        x: input to convolve.

    Returns:
        The convolution output; ``b`` is added when ``bias_init`` is not
        ``None``.
    """
    def channel_group(index):
        # Piece of x along axis 0 that belongs to group number `index`.
        start = index * sub_input_channels
        return ops.slice(x, axis=0,
                         begin_index=start,
                         end_index=start + sub_input_channels)

    if group != 1:
        groups_data = [channel_group(i) for i in range(0, group)]
        apply_sub = [
            _conv_ops(kernel, data)
            for kernel, data in zip(groups_kernel, groups_data)
        ]
        # Join the per-group results back together along axis 0.
        apply_x = ops.splice(*apply_sub, axis=0)
    else:
        apply_x = _conv_ops(w, x)

    if bias_init is not None:
        apply_x += b

    return apply_x
def sequence_to_sequence_translator(debug_output=False):
    """Build, train and test the CMUDict sequence-to-sequence model.

    The network is a stacked-LSTM encoder whose first-step states (the
    recurrence runs through ``future_value``) seed the first decoder layer.
    Training reads the train/dev CTF file until the reader returns an empty
    minibatch; testing then runs over the test CTF file.

    Args:
        debug_output: when true, training progress is reported three times
            as often and the error of every test minibatch is printed.

    Returns:
        The average of the per-minibatch test errors.

    Raises:
        ZeroDivisionError: if the test reader yields no minibatch at all.
    """
    input_vocab_dim = 69
    label_vocab_dim = 69

    hidden_dim = 512
    num_layers = 2

    # Source and target inputs to the model
    batch_axis = Axis.default_batch_axis()
    input_seq_axis = Axis("inputAxis")
    label_seq_axis = Axis("labelAxis")

    input_dynamic_axes = [batch_axis, input_seq_axis]
    raw_input = input_variable(shape=(input_vocab_dim), dynamic_axes=input_dynamic_axes)

    label_dynamic_axes = [batch_axis, label_seq_axis]
    raw_labels = input_variable(shape=(label_vocab_dim), dynamic_axes=label_dynamic_axes)

    # Instantiate the sequence to sequence translation model
    input_sequence = raw_input

    # Drop the sentence start token from the label, for decoder training
    label_sequence = slice(raw_labels, label_seq_axis, 1, 0)
    label_sentence_start = sequence.first(raw_labels)

    is_first_label = sequence.is_first(label_sequence)
    label_sentence_start_scattered = sequence.scatter(label_sentence_start, is_first_label)

    # Encoder
    encoder_outputH = stabilize(input_sequence)
    for _ in range(num_layers):
        (encoder_outputH, encoder_outputC) = LSTMP_component_with_self_stabilization(
            encoder_outputH.output(), hidden_dim, hidden_dim, future_value, future_value
        )

    thought_vectorH = sequence.first(encoder_outputH)
    thought_vectorC = sequence.first(encoder_outputC)

    thought_vector_broadcastH = sequence.broadcast_as(thought_vectorH, label_sequence)
    thought_vector_broadcastC = sequence.broadcast_as(thought_vectorC, label_sequence)

    # Decoder
    decoder_history_from_ground_truth = label_sequence
    decoder_input = element_select(
        is_first_label, label_sentence_start_scattered, past_value(decoder_history_from_ground_truth)
    )

    decoder_outputH = stabilize(decoder_input)
    for layer in range(num_layers):
        if layer > 0:
            recurrence_hookH = past_value
            recurrence_hookC = past_value
        else:
            # The first decoder layer starts from the encoder's thought vector.
            isFirst = sequence.is_first(label_sequence)
            recurrence_hookH = lambda operand: element_select(isFirst, thought_vector_broadcastH, past_value(operand))
            recurrence_hookC = lambda operand: element_select(isFirst, thought_vector_broadcastC, past_value(operand))

        # The decoder's cell output is not used afterwards.
        (decoder_outputH, _decoder_outputC) = LSTMP_component_with_self_stabilization(
            decoder_outputH.output(), hidden_dim, hidden_dim, recurrence_hookH, recurrence_hookC
        )

    decoder_output = decoder_outputH

    # Softmax output layer
    z = linear_layer(stabilize(decoder_output), label_vocab_dim)
    ce = cross_entropy_with_softmax(z, label_sequence)
    errs = classification_error(z, label_sequence)

    # Instantiate the trainer object to drive the model training
    lr = 0.007
    momentum_time_constant = 1100
    momentum_per_sample = momentums_per_sample(math.exp(-1.0 / momentum_time_constant))
    clipping_threshold_per_sample = 2.3
    gradient_clipping_with_truncation = True

    trainer = Trainer(
        z,
        ce,
        errs,
        [
            momentum_sgd(
                z.parameters(),
                lr,
                momentum_per_sample,
                clipping_threshold_per_sample,
                gradient_clipping_with_truncation,
            )
        ],
    )

    rel_path = r"../../../../Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b.train-dev-20-21.ctf"
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), rel_path)
    feature_stream_name = "features"
    labels_stream_name = "labels"

    mb_source = text_format_minibatch_source(
        path,
        [
            StreamConfiguration(feature_stream_name, input_vocab_dim, True, "S0"),
            StreamConfiguration(labels_stream_name, label_vocab_dim, True, "S1"),
        ],
        10000,
    )
    features_si = mb_source[feature_stream_name]
    labels_si = mb_source[labels_stream_name]

    # Get minibatches of sequences to train with and perform model training
    minibatch_size = 72
    training_progress_output_freq = 30
    if debug_output:
        # Integer division keeps the frequency an int (30 -> 10).
        training_progress_output_freq = training_progress_output_freq // 3

    # The minibatch counter starts at zero; it must not inherit whatever
    # value a layer-loop index happened to end on.
    i = 0
    while True:
        mb = mb_source.get_next_minibatch(minibatch_size)
        if len(mb) == 0:
            break

        # Specify the mapping of input variables in the model to actual
        # minibatch data to be trained with
        arguments = {raw_input: mb[features_si], raw_labels: mb[labels_si]}
        trainer.train_minibatch(arguments)

        print_training_progress(trainer, i, training_progress_output_freq)

        i += 1

    rel_path = r"../../../../Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b.test.ctf"
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), rel_path)

    test_mb_source = text_format_minibatch_source(
        path,
        [
            StreamConfiguration(feature_stream_name, input_vocab_dim, True, "S0"),
            StreamConfiguration(labels_stream_name, label_vocab_dim, True, "S1"),
        ],
        10000,
        False,
    )
    features_si = test_mb_source[feature_stream_name]
    labels_si = test_mb_source[labels_stream_name]

    # choose this to be big enough for the longest sentence
    test_minibatch_size = 1024

    # Get minibatches of sequences to test and perform testing
    i = 0
    total_error = 0.0
    while True:
        mb = test_mb_source.get_next_minibatch(test_minibatch_size)
        if len(mb) == 0:
            break

        # Specify the mapping of input variables in the model to actual
        # minibatch data to be tested with
        arguments = {raw_input: mb[features_si], raw_labels: mb[labels_si]}
        mb_error = trainer.test_minibatch(arguments)

        total_error += mb_error

        if debug_output:
            print("Minibatch {}, Error {} ".format(i, mb_error))

        i += 1

    # Average of evaluation errors of all test minibatches
    return total_error / i
def LSTM(shape, cell_shape=None, use_peepholes=use_peepholes_default_or_False,
         init=init_default_or_glorot_uniform, init_bias=init_bias_default_or_0,
         enable_self_stabilization=enable_self_stabilization_default_or_False):
    """Create the LSTM step function, mapping ``(x, (h, c))`` to ``(h, c)``.

    Args:
        shape: output (hidden state) shape; must be rank 1.
        cell_shape: cell shape; must be rank 1. When given, an output
            projection from ``cell_shape`` to ``shape`` is added; when
            ``None`` the cell uses ``shape``.
        use_peepholes: whether the gates get peephole connections; taken
            from the current default options when not given.
        init: initializer for the weight parameters.
        init_bias: initializer for the bias parameter.
        enable_self_stabilization: whether ``Stabilizer`` instances are put
            on the previous state, the cell and the hidden output; taken
            from the current default options when not given.

    Returns:
        ``combine([h, c])``, carrying an extra attribute
        ``create_placeholder`` that builds matching ``(h, c)`` placeholders
        for a recurrence. The attribute exists only on the returned object,
        not on clones of it.

    Raises:
        ValueError: if ``shape`` or ``cell_shape`` is not rank 1.
    """
    # Fall back to the current default options for anything not given.
    if not _is_given(use_peepholes):
        use_peepholes = _current_default_options.use_peepholes
    if not _is_given(enable_self_stabilization):
        enable_self_stabilization = _current_default_options.enable_self_stabilization

    has_projection = cell_shape is not None
    has_aux = False

    if has_aux:
        UntestedBranchError("LSTM, has_aux option")

    shape = _as_tuple(shape)
    cell_shape = _as_tuple(cell_shape) if has_projection else shape

    # Anything but vectors would require fixing the slicing and the
    # parameter initializers below.
    if not (len(shape) == 1 and len(cell_shape) == 1):
        raise ValueError("LSTM: shape and cell_shape must be vectors (rank-1 tensors)")

    # Stack along the fastest-changing axis, to match BS.
    stack_axis = -1

    # Stacking dimensions: the four gate slots sit next to each other. Both
    # shapes are rank 1 at this point, so the stacked shape is a 1-tuple.
    stacked_dim = cell_shape[0]
    cell_shape_stacked = (stacked_dim * 4,)

    # Parameters (creation order kept as in the reference layout).
    b = Parameter(cell_shape_stacked, init=init_bias, name='b')         # bias
    W = Parameter(_INFERRED + cell_shape_stacked, init=init, name='W')  # input
    if has_aux:
        A = Parameter(_INFERRED + cell_shape_stacked, init=init, name='A')  # aux input (optional)
    else:
        A = None
    H = Parameter(shape + cell_shape_stacked, init=init, name='H')      # hidden-to-hidden

    # Peephole weights, applied elementwise.
    Ci = Parameter(cell_shape, init=init, name='Ci') if use_peepholes else None
    Cf = Parameter(cell_shape, init=init, name='Cf') if use_peepholes else None
    Co = Parameter(cell_shape, init=init, name='Co') if use_peepholes else None

    # Final projection from cell_shape to shape.
    Wmr = Parameter(cell_shape + shape, init=init) if has_projection else None

    if enable_self_stabilization:
        Sdh, Sdc, Sct, Sht = Stabilizer(), Stabilizer(), Stabilizer(), Stabilizer()
    else:
        Sdh = Sdc = Sct = Sht = identity

    def create_hc_placeholder():
        # Passing the known dimensions makes dimension inference easier.
        h_placeholder = Placeholder(shape=shape, name='hPh')
        c_placeholder = Placeholder(shape=cell_shape, name='cPh')
        return (h_placeholder, c_placeholder)  # (h, c)

    # Arguments of the model function.
    x = Placeholder(name='lstm_block_arg')
    prev_state = create_hc_placeholder()

    # Formula of the model function.
    dh, dc = prev_state

    # Previous values, stabilized. The input does not get a stabilizer here;
    # the user is meant to do that outside.
    dhs = Sdh(dh)
    dcs = Sdc(dc)

    # Projected contribution from input(s), hidden state and bias.
    proj4 = b + times(x, W) + times(dhs, H)
    if has_aux:
        proj4 = proj4 + times(aux, A)

    def gate_slot(index):
        # Cut the index-th block of width stacked_dim out of the stack axis.
        return slice(proj4, stack_axis,
                     index * stacked_dim, (index + 1) * stacked_dim)

    it_proj = gate_slot(0)
    bit_proj = gate_slot(1)
    ft_proj = gate_slot(2)
    ot_proj = gate_slot(3)

    def peep(gate_proj, cell, peep_weight):
        # Add the peephole connection only when it is requested.
        if use_peepholes:
            return gate_proj + peep_weight * cell
        return gate_proj

    it = sigmoid(peep(it_proj, dcs, Ci))       # input gate(t)
    bit = it * tanh(bit_proj)                  # applied to tanh of input network

    ft = sigmoid(peep(ft_proj, dcs, Cf))       # forget gate(t)
    bft = ft * dc                              # applied to cell(t-1)

    ct = bft + bit                             # c(t) is the sum of both

    ot = sigmoid(peep(ot_proj, Sct(ct), Co))   # output gate(t)
    ht = ot * tanh(ct)                         # applied to tanh(cell(t))

    c = ct
    if has_projection:
        h = times(Sht(ht), Wmr)
    else:
        h = ht

    _name_node(h, 'h')
    if _trace_layers:
        _log_node(h)
    _name_node(c, 'c')

    # TODO: figure out how to do scoping, and also rename all the apply... to expression
    apply_x_h_c = combine([h, c])

    # Hand the caller a helper that creates placeholders for the recurrence.
    apply_x_h_c.create_placeholder = create_hc_placeholder

    # Wrapping in Block(apply_x_h_c, 'LSTM') is not done: it fails with
    # "RuntimeError: A Function instance with more than one output cannot be
    # implicitly converted to a Variable".
    return apply_x_h_c
def train_sequence_to_sequence_translator():
    """Build the CMUDict sequence-to-sequence model and train it.

    The network is a stacked-LSTM encoder whose first-step states (the
    recurrence runs through ``future_value``) seed the first decoder layer.
    Training reads the train/dev CTF file until the reader returns an empty
    minibatch.

    Returns:
        None.
    """
    input_vocab_dim = 69
    label_vocab_dim = 69

    hidden_dim = 512
    num_layers = 2

    # Source and target inputs to the model
    input_dynamic_axes = [Axis('inputAxis'), Axis.default_batch_axis()]
    raw_input = input_variable(shape=(input_vocab_dim), dynamic_axes=input_dynamic_axes)

    label_dynamic_axes = [Axis('labelAxis'), Axis.default_batch_axis()]
    raw_labels = input_variable(shape=(label_vocab_dim), dynamic_axes=label_dynamic_axes)

    # Instantiate the sequence to sequence translation model
    input_sequence = raw_input

    # Drop the sentence start token from the label, for decoder training
    label_sequence = slice(raw_labels, label_dynamic_axes[0], 1, 0)
    label_sentence_start = sequence.first(raw_labels)

    is_first_label = sequence.is_first(label_sequence)
    label_sentence_start_scattered = sequence.scatter(label_sentence_start, is_first_label)

    # Encoder
    encoder_outputH = stabilize(input_sequence)
    for _ in range(num_layers):
        (encoder_outputH, encoder_outputC) = LSTMP_component_with_self_stabilization(
            encoder_outputH, hidden_dim, hidden_dim, future_value, future_value)

    thought_vectorH = sequence.first(encoder_outputH)
    thought_vectorC = sequence.first(encoder_outputC)

    thought_vector_broadcastH = sequence.broadcast_as(thought_vectorH, label_sequence)
    thought_vector_broadcastC = sequence.broadcast_as(thought_vectorC, label_sequence)

    # Decoder
    decoder_history_from_ground_truth = label_sequence
    decoder_input = element_select(is_first_label, label_sentence_start_scattered,
                                   past_value(decoder_history_from_ground_truth))

    decoder_outputH = stabilize(decoder_input)
    for layer in range(num_layers):
        # The FIRST decoder layer is the one seeded with the thought vector;
        # deeper layers use a plain past_value recurrence. The test used to
        # be inverted (`i == 0`), which left layer 0 unseeded and would
        # disconnect the decoder from the encoder for a one-layer model.
        if layer > 0:
            recurrence_hookH = past_value
            recurrence_hookC = past_value
        else:
            isFirst = sequence.is_first(label_sequence)
            recurrence_hookH = lambda operand: element_select(isFirst, thought_vector_broadcastH, past_value(operand))
            recurrence_hookC = lambda operand: element_select(isFirst, thought_vector_broadcastC, past_value(operand))

        # The decoder's cell output is not used afterwards.
        (decoder_outputH, _decoder_outputC) = LSTMP_component_with_self_stabilization(
            decoder_outputH, hidden_dim, hidden_dim, recurrence_hookH, recurrence_hookC)

    decoder_output = decoder_outputH

    # Softmax output layer
    z = linear_layer(stabilize(decoder_output), label_vocab_dim)
    ce = cross_entropy_with_softmax(z, label_sequence)
    errs = classification_error(z, label_sequence)

    rel_path = r"../../../../Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b.train-dev-20-21.ctf"
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), rel_path)
    feature_stream_name = 'features'
    labels_stream_name = 'labels'

    mb_source = text_format_minibatch_source(path, [
        StreamConfiguration(feature_stream_name, input_vocab_dim, True, 'S0'),
        StreamConfiguration(labels_stream_name, label_vocab_dim, True, 'S1')], 10000)
    features_si = mb_source.stream_info(feature_stream_name)
    labels_si = mb_source.stream_info(labels_stream_name)

    # Instantiate the trainer object to drive the model training
    lr = learning_rates_per_sample(0.007)
    momentum_time_constant = 1100
    momentum_per_sample = momentums_per_sample(math.exp(-1.0 / momentum_time_constant))
    clipping_threshold_per_sample = 2.3
    gradient_clipping_with_truncation = True

    trainer = Trainer(z, ce, errs, [momentum_sgd_learner(
        z.owner.parameters(), lr, momentum_per_sample,
        clipping_threshold_per_sample, gradient_clipping_with_truncation)])

    # Get minibatches of sequences to train with and perform model training
    minibatch_size = 72
    training_progress_output_freq = 10

    # The minibatch counter starts at zero; it must not inherit whatever
    # value a layer-loop index happened to end on.
    i = 0
    while True:
        mb = mb_source.get_next_minibatch(minibatch_size)
        if len(mb) == 0:
            break

        # Specify the mapping of input variables in the model to actual
        # minibatch data to be trained with
        arguments = {raw_input: mb[features_si].m_data, raw_labels: mb[labels_si].m_data}
        trainer.train_minibatch(arguments)

        print_training_progress(trainer, i, training_progress_output_freq)

        i += 1
def sequence_to_sequence_translator(debug_output=False, run_test=False):
    """Build, train and test the CMUDict sequence-to-sequence model.

    Besides the training graph, a clone is created in which the decoder
    history is the network's own ``hardmax`` output instead of the ground
    truth. That clone is evaluated on a small validation file at regular
    intervals and its output printed.

    Args:
        debug_output: when true, the error of every test minibatch is
            printed.
        run_test: when true, training is shortened (one epoch of 5000
            samples, more frequent progress output) and the training reader
            is not randomized, so that the resulting error is exact.

    Returns:
        The average of the per-minibatch test errors.

    Raises:
        ZeroDivisionError: if the test reader yields no minibatch at all.
    """
    input_vocab_dim = 69
    label_vocab_dim = 69

    # network complexity; initially low for faster testing
    hidden_dim = 256
    num_layers = 1

    # Source and target inputs to the model
    batch_axis = Axis.default_batch_axis()
    input_seq_axis = Axis('inputAxis')
    label_seq_axis = Axis('labelAxis')

    input_dynamic_axes = [batch_axis, input_seq_axis]
    raw_input = input_variable(shape=(input_vocab_dim),
                               dynamic_axes=input_dynamic_axes,
                               name='raw_input')

    label_dynamic_axes = [batch_axis, label_seq_axis]
    raw_labels = input_variable(shape=(label_vocab_dim),
                                dynamic_axes=label_dynamic_axes,
                                name='raw_labels')

    # Instantiate the sequence to sequence translation model
    input_sequence = raw_input

    # Drop the sentence start token from the label, for decoder training
    label_sequence = slice(raw_labels, label_seq_axis, 1, 0)  # <s> A B C </s> --> A B C </s>
    label_sentence_start = sequence.first(raw_labels)         # <s>

    is_first_label = sequence.is_first(label_sequence)        # <s> 0 0 0 ...
    label_sentence_start_scattered = sequence.scatter(label_sentence_start, is_first_label)

    # Encoder
    encoder_outputH = stabilize(input_sequence)
    for _ in range(num_layers):
        (encoder_outputH, encoder_outputC) = LSTMP_component_with_self_stabilization(
            encoder_outputH.output, hidden_dim, hidden_dim, future_value, future_value)

    thought_vectorH = sequence.first(encoder_outputH)
    thought_vectorC = sequence.first(encoder_outputC)

    thought_vector_broadcastH = sequence.broadcast_as(thought_vectorH, label_sequence)
    thought_vector_broadcastC = sequence.broadcast_as(thought_vectorC, label_sequence)

    # Decoder
    decoder_history_hook = alias(label_sequence, name='decoder_history_hook')  # copy label_sequence

    decoder_input = element_select(is_first_label, label_sentence_start_scattered,
                                   past_value(decoder_history_hook))

    decoder_outputH = stabilize(decoder_input)
    for layer in range(num_layers):
        if layer > 0:
            recurrence_hookH = past_value
            recurrence_hookC = past_value
        else:
            # The first decoder layer starts from the encoder's thought vector.
            isFirst = sequence.is_first(label_sequence)
            recurrence_hookH = lambda operand: element_select(
                isFirst, thought_vector_broadcastH, past_value(operand))
            recurrence_hookC = lambda operand: element_select(
                isFirst, thought_vector_broadcastC, past_value(operand))

        # The decoder's cell output is not used afterwards.
        (decoder_outputH, _decoder_outputC) = LSTMP_component_with_self_stabilization(
            decoder_outputH.output, hidden_dim, hidden_dim, recurrence_hookH, recurrence_hookC)

    decoder_output = decoder_outputH

    # Softmax output layer
    z = linear_layer(stabilize(decoder_output), label_vocab_dim)

    # Criterion nodes
    ce = cross_entropy_with_softmax(z, label_sequence)
    errs = classification_error(z, label_sequence)

    # network output for decoder history
    net_output = hardmax(z)

    # make a clone of the graph where the ground truth is replaced by the network output
    ng = z.clone(CloneMethod.share, {decoder_history_hook.output: net_output.output})

    # Instantiate the trainer object to drive the model training
    lr = 0.007
    minibatch_size = 72
    momentum_time_constant = 1100
    m_schedule = momentum_schedule(momentum_time_constant)
    clipping_threshold_per_sample = 2.3
    gradient_clipping_with_truncation = True
    learner = momentum_sgd(z.parameters, lr, m_schedule,
                           clipping_threshold_per_sample,
                           gradient_clipping_with_truncation)
    trainer = Trainer(z, ce, errs, learner)

    # setup data
    here = os.path.dirname(os.path.abspath(__file__))
    rel_path = r"../../../../Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b.train-dev-20-21.ctf"
    train_path = os.path.join(here, rel_path)
    valid_path = os.path.join(here, "tiny.ctf")
    feature_stream_name = 'features'
    labels_stream_name = 'labels'

    # readers; no randomization in test mode because we want an exact error
    randomize_data = not run_test

    train_reader = text_format_minibatch_source(train_path, [
        StreamConfiguration(feature_stream_name, input_vocab_dim, True, 'S0'),
        StreamConfiguration(labels_stream_name, label_vocab_dim, True, 'S1')
    ], randomize=randomize_data)
    features_si_tr = train_reader.stream_info(feature_stream_name)
    labels_si_tr = train_reader.stream_info(labels_stream_name)

    valid_reader = text_format_minibatch_source(valid_path, [
        StreamConfiguration(feature_stream_name, input_vocab_dim, True, 'S0'),
        StreamConfiguration(labels_stream_name, label_vocab_dim, True, 'S1')
    ], randomize=False)
    features_si_va = valid_reader.stream_info(feature_stream_name)
    labels_si_va = valid_reader.stream_info(labels_stream_name)

    # get the vocab for printing output sequences in plaintext
    rel_path = r"../../../../Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b.mapping"
    vocab_path = os.path.join(here, rel_path)
    # Read inside a context manager so the file handle is closed right away.
    with open(vocab_path) as vocab_file:
        vocab = [w.strip() for w in vocab_file]
    i2w = dict(enumerate(vocab))

    # Get minibatches of sequences to train with and perform model training
    i = 0
    mbs = 0
    epoch_size = 908241
    max_epochs = 10
    training_progress_output_freq = 500

    # make things more basic for running a quicker test
    if run_test:
        epoch_size = 5000
        max_epochs = 1
        training_progress_output_freq = 30

    for epoch in range(max_epochs):
        loss_numer = 0
        metric_numer = 0
        denom = 0

        # `i` counts label samples seen so far across all epochs.
        while i < (epoch + 1) * epoch_size:
            # get next minibatch of training data
            mb_train = train_reader.next_minibatch(minibatch_size)
            train_args = {
                'raw_input': mb_train[features_si_tr],
                'raw_labels': mb_train[labels_si_tr]
            }
            trainer.train_minibatch(train_args)

            # collect epoch-wide stats
            samples = trainer.previous_minibatch_sample_count
            loss_numer += trainer.previous_minibatch_loss_average * samples
            metric_numer += trainer.previous_minibatch_evaluation_average * samples
            denom += samples

            # every N MBs evaluate on a test sequence to visually show how we're doing
            if mbs % training_progress_output_freq == 0:
                mb_valid = valid_reader.next_minibatch(minibatch_size)
                valid_args = {
                    'raw_input': mb_valid[features_si_va],
                    'raw_labels': mb_valid[labels_si_va]
                }

                e = ng.eval(valid_args)
                print_sequences(e, i2w)

            print_training_progress(trainer, mbs, training_progress_output_freq)
            i += mb_train[labels_si_tr].num_samples
            mbs += 1

        print("--- EPOCH %d DONE: loss = %f, errs = %f ---" %
              (epoch, loss_numer / denom, 100.0 * (metric_numer / denom)))

    # now setup a test run
    rel_path = r"../../../../Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b.test.ctf"
    test_path = os.path.join(here, rel_path)

    test_reader = text_format_minibatch_source(test_path, [
        StreamConfiguration(feature_stream_name, input_vocab_dim, True, 'S0'),
        StreamConfiguration(labels_stream_name, label_vocab_dim, True, 'S1')
    ], 10000, randomize=False)
    features_si_te = test_reader.stream_info(feature_stream_name)
    labels_si_te = test_reader.stream_info(labels_stream_name)

    test_minibatch_size = 1024

    # Get minibatches of sequences to test and perform testing
    i = 0
    total_error = 0.0
    while True:
        mb = test_reader.next_minibatch(test_minibatch_size)
        if len(mb) == 0:
            break

        # Specify the mapping of input variables in the model to actual
        # minibatch data to be tested with
        arguments = {
            raw_input: mb[features_si_te],
            raw_labels: mb[labels_si_te]
        }
        mb_error = trainer.test_minibatch(arguments)

        total_error += mb_error

        if debug_output:
            print("Minibatch {}, Error {} ".format(i, mb_error))

        i += 1

    # Average of evaluation errors of all test minibatches
    return total_error / i
def sequence_to_sequence_translator(debug_output=False, run_test=False):
    """Build and train the CMUDict sequence-to-sequence model, then check save/load.

    Besides the training graph, a clone is created in which the decoder
    history is the network's own ``hardmax`` output instead of the ground
    truth. That clone is evaluated on a small validation file at regular
    intervals and its output printed. After training, the test error is
    computed, the model is saved to ``seq2seq.dnn`` and reloaded, and the
    test error is computed again on the reloaded model.

    Args:
        debug_output: accepted for interface compatibility; not read by
            this function.
        run_test: when true, training is shortened (one epoch of 5000
            samples, more frequent progress output) and the training reader
            is not randomized, so that the resulting error is exact.

    Returns:
        The test error measured before the model was saved.

    Raises:
        AssertionError: if the reloaded model does not reproduce the test
            error (only when assertions are enabled).
    """
    input_vocab_dim = 69
    label_vocab_dim = 69

    # network complexity; initially low for faster testing
    hidden_dim = 256
    num_layers = 1

    # Source and target inputs to the model
    batch_axis = Axis.default_batch_axis()
    input_seq_axis = Axis('inputAxis')
    label_seq_axis = Axis('labelAxis')

    input_dynamic_axes = [batch_axis, input_seq_axis]
    raw_input = input_variable(
        shape=(input_vocab_dim), dynamic_axes=input_dynamic_axes, name='raw_input')

    label_dynamic_axes = [batch_axis, label_seq_axis]
    raw_labels = input_variable(
        shape=(label_vocab_dim), dynamic_axes=label_dynamic_axes, name='raw_labels')

    # Instantiate the sequence to sequence translation model
    input_sequence = raw_input

    # Drop the sentence start token from the label, for decoder training
    label_sequence = slice(raw_labels, label_seq_axis, 1, 0)  # <s> A B C </s> --> A B C </s>
    label_sentence_start = sequence.first(raw_labels)         # <s>

    is_first_label = sequence.is_first(label_sequence)        # <s> 0 0 0 ...
    label_sentence_start_scattered = sequence.scatter(
        label_sentence_start, is_first_label)

    # Encoder
    encoder_outputH = stabilize(input_sequence)
    for _ in range(num_layers):
        (encoder_outputH, encoder_outputC) = LSTMP_component_with_self_stabilization(
            encoder_outputH.output, hidden_dim, hidden_dim, future_value, future_value)

    thought_vectorH = sequence.first(encoder_outputH)
    thought_vectorC = sequence.first(encoder_outputC)

    thought_vector_broadcastH = sequence.broadcast_as(
        thought_vectorH, label_sequence)
    thought_vector_broadcastC = sequence.broadcast_as(
        thought_vectorC, label_sequence)

    # Decoder
    decoder_history_hook = alias(label_sequence, name='decoder_history_hook')  # copy label_sequence

    decoder_input = element_select(is_first_label, label_sentence_start_scattered,
                                   past_value(decoder_history_hook))

    decoder_outputH = stabilize(decoder_input)
    for layer in range(num_layers):
        if layer > 0:
            recurrence_hookH = past_value
            recurrence_hookC = past_value
        else:
            # The first decoder layer starts from the encoder's thought vector.
            isFirst = sequence.is_first(label_sequence)
            recurrence_hookH = lambda operand: element_select(
                isFirst, thought_vector_broadcastH, past_value(operand))
            recurrence_hookC = lambda operand: element_select(
                isFirst, thought_vector_broadcastC, past_value(operand))

        # The decoder's cell output is not used afterwards.
        (decoder_outputH, _decoder_outputC) = LSTMP_component_with_self_stabilization(
            decoder_outputH.output, hidden_dim, hidden_dim, recurrence_hookH, recurrence_hookC)

    decoder_output = decoder_outputH

    # Softmax output layer
    z = linear_layer(stabilize(decoder_output), label_vocab_dim)

    # Criterion nodes
    ce = cross_entropy_with_softmax(z, label_sequence)
    errs = classification_error(z, label_sequence)

    # network output for decoder history
    net_output = hardmax(z)

    # make a clone of the graph where the ground truth is replaced by the network output
    ng = z.clone(CloneMethod.share, {decoder_history_hook.output: net_output.output})

    # Instantiate the trainer object to drive the model training
    lr = 0.007
    minibatch_size = 72
    momentum_time_constant = 1100
    m_schedule = momentum_schedule(momentum_time_constant)
    clipping_threshold_per_sample = 2.3
    gradient_clipping_with_truncation = True
    learner = momentum_sgd(z.parameters, lr, m_schedule,
                           clipping_threshold_per_sample,
                           gradient_clipping_with_truncation)
    trainer = Trainer(z, ce, errs, learner)

    # setup data
    here = os.path.dirname(os.path.abspath(__file__))
    rel_path = r"../../../../Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b.train-dev-20-21.ctf"
    train_path = os.path.join(here, rel_path)
    valid_path = os.path.join(here, "tiny.ctf")

    # readers; no randomization in test mode because we want an exact error
    randomize_data = not run_test

    train_reader = create_reader(train_path, randomize_data, input_vocab_dim, label_vocab_dim)
    train_bind = {
        raw_input: train_reader.streams.features,
        raw_labels: train_reader.streams.labels
    }

    # get the vocab for printing output sequences in plaintext
    rel_path = r"../../../../Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b.mapping"
    vocab_path = os.path.join(here, rel_path)
    # Read inside a context manager so the file handle is closed right away.
    with open(vocab_path) as vocab_file:
        vocab = [w.strip() for w in vocab_file]
    i2w = dict(enumerate(vocab))

    # Get minibatches of sequences to train with and perform model training
    i = 0
    mbs = 0
    epoch_size = 908241
    max_epochs = 10
    training_progress_output_freq = 500

    # make things more basic for running a quicker test
    if run_test:
        epoch_size = 5000
        max_epochs = 1
        training_progress_output_freq = 30

    # The validation reader is bound to the inputs of the cloned graph.
    valid_reader = create_reader(valid_path, False, input_vocab_dim, label_vocab_dim)
    valid_bind = {
        find_arg_by_name('raw_input', ng): valid_reader.streams.features,
        find_arg_by_name('raw_labels', ng): valid_reader.streams.labels
    }

    for epoch in range(max_epochs):
        loss_numer = 0
        metric_numer = 0
        denom = 0

        # `i` counts label samples seen so far across all epochs.
        while i < (epoch + 1) * epoch_size:
            # get next minibatch of training data
            mb_train = train_reader.next_minibatch(minibatch_size, input_map=train_bind)
            trainer.train_minibatch(mb_train)

            # collect epoch-wide stats
            samples = trainer.previous_minibatch_sample_count
            loss_numer += trainer.previous_minibatch_loss_average * samples
            metric_numer += trainer.previous_minibatch_evaluation_average * samples
            denom += samples

            # every N MBs evaluate on a test sequence to visually show how we're doing
            if mbs % training_progress_output_freq == 0:
                mb_valid = valid_reader.next_minibatch(minibatch_size, input_map=valid_bind)
                e = ng.eval(mb_valid)
                print_sequences(e, i2w)

            print_training_progress(trainer, mbs, training_progress_output_freq)
            i += mb_train[raw_labels].num_samples
            mbs += 1

        print("--- EPOCH %d DONE: loss = %f, errs = %f ---" %
              (epoch, loss_numer / denom, 100.0 * (metric_numer / denom)))

    error1 = translator_test_error(z, trainer, input_vocab_dim, label_vocab_dim)

    # Round-trip the model through disk and rebuild criteria and trainer on
    # the reloaded graph.
    save_model(z, "seq2seq.dnn")
    z = load_model("seq2seq.dnn")

    label_seq_axis = Axis('labelAxis')
    label_sequence = slice(find_arg_by_name('raw_labels', z), label_seq_axis, 1, 0)
    ce = cross_entropy_with_softmax(z, label_sequence)
    errs = classification_error(z, label_sequence)
    trainer = Trainer(z, ce, errs, [momentum_sgd(
        z.parameters, lr, m_schedule,
        clipping_threshold_per_sample, gradient_clipping_with_truncation)])

    error2 = translator_test_error(z, trainer, input_vocab_dim, label_vocab_dim)

    # The reloaded model must reproduce the error of the in-memory one.
    assert error1 == error2

    return error1
def create_model(params: model_params):
    """
    Build the ReasoNet network graph.

    The query and the context are embedded with a shared embedding
    parameter, encoded with one GRU stack each, and handed to
    ``attention_model`` together with an initial state derived from the
    query encoding.

    Args:
      params (class:`model_params`): Hyper-parameters of the model. The
        attributes read here are ``vocab_dim``, ``embedding_dim``,
        ``hidden_dim``, ``attention_dim``, ``dropout_rate``, ``init``
        (logged only), ``embedding_init`` and ``max_rl_steps``.

    Returns:
      Whatever ``attention_model`` returns for the constructed context
      memory, query memory and initial status.
    """
    vocab_dim = params.vocab_dim
    embedding_dim = params.embedding_dim
    hidden_dim = params.hidden_dim
    dropout_rate = params.dropout_rate
    embedding_shape = (vocab_dim, embedding_dim)

    logger.log(
        "Create model: dropout_rate: {0}, init:{1}, embedding_init: {2}".format(
            dropout_rate, params.init, params.embedding_init))

    # Model inputs: the query on its own sequence axis; the context
    # (doc/paragraph) and its entity mask share a second sequence axis.
    query_axis = Axis('sourceAxis')
    context_axis = Axis('contextAxis')
    query_sequence = sequence.input(
        shape=vocab_dim, is_sparse=True,
        sequence_axis=query_axis, name='query')
    context_sequence = sequence.input(
        shape=vocab_dim, is_sparse=True,
        sequence_axis=context_axis, name='context')
    entity_ids_mask = sequence.input(
        shape=(1, ), is_sparse=False,
        sequence_axis=context_axis, name='entity_ids_mask')

    # Embedding: use the supplied initial matrix if there is one, otherwise
    # generate a random matrix of the right shape.
    initial_embedding = params.embedding_init
    if initial_embedding is None:
        initial_embedding = create_random_matrix(vocab_dim, embedding_dim)

    # Learnable copy of the embedding ...
    embedding = parameter(shape=embedding_shape, init=None)
    embedding.value = initial_embedding
    # ... and a constant copy holding the same initial values, used for the
    # entity tokens below.
    frozen_embedding = constant(initial_embedding, shape=embedding_shape)

    if dropout_rate is None:
        query_embedding = times(
            query_sequence, embedding, name='query_embedding')
        context_embedding = times(
            context_sequence, embedding, name='context_embedding')
    else:
        query_embedding = ops.dropout(
            times(query_sequence, embedding),
            dropout_rate, name='query_embedding')
        context_embedding = ops.dropout(
            times(context_sequence, embedding),
            dropout_rate, name='context_embedding')

    # One GRU weight blob per encoder (both carry the name 'gru_params').
    gru_weight_shape = _INFERRED + _as_tuple(hidden_dim)
    context_gru_weights = Parameter(
        gru_weight_shape, init=glorot_uniform(), name='gru_params')
    query_gru_weights = Parameter(
        gru_weight_shape, init=glorot_uniform(), name='gru_params')

    # Unlike other words in the context, the entity vectors are kept fixed
    # as random vectors, so that each vector is merely an identifier of an
    # entity in the context and carries no semantic meaning.
    entity_embedding = ops.times(
        context_sequence, frozen_embedding, name='constant_entity_embedding')
    full_context_embedding = ops.element_select(
        entity_ids_mask, entity_embedding, context_embedding)

    context_memory = ops.optimized_rnnstack(
        full_context_embedding, context_gru_weights, hidden_dim, 1, True,
        recurrent_op='gru', name='context_mem')
    query_memory = ops.optimized_rnnstack(
        query_embedding, query_gru_weights, hidden_dim, 1, True,
        recurrent_op='gru', name='query_mem')

    # Initial status: last step of the forward half spliced with the first
    # step of the backward half of the query encoding (the encoding is
    # sliced as [0, hidden_dim) = fwd and [hidden_dim, 2*hidden_dim) = bwd).
    last_forward = ops.slice(
        sequence.last(query_memory), -1, 0, hidden_dim, name='fwd')
    first_backward = ops.slice(
        sequence.first(query_memory), -1, hidden_dim, hidden_dim * 2,
        name='bwd')
    init_status = ops.splice(last_forward, first_backward, name='Init_Status')

    return attention_model(
        context_memory, query_memory, init_status,
        hidden_dim, params.attention_dim,
        max_steps=params.max_rl_steps)
def sequence_to_sequence_translator(debug_output=False):
    """Train a 2-layer LSTM encoder/decoder on CMUDict and return its test error.

    The function builds the network, trains it on every minibatch the
    training source yields, then evaluates on the CMUDict test file.

    Args:
        debug_output: When true, training progress is reported three times
            as often and the error of every test minibatch is printed.

    Returns:
        The mean of the per-minibatch values returned by
        ``trainer.test_minibatch`` over the test source.

    Raises:
        ZeroDivisionError: If the test source yields no minibatch.
    """
    input_vocab_dim = 69
    label_vocab_dim = 69

    hidden_dim = 512
    num_layers = 2

    # Source and target inputs to the model
    batch_axis = Axis.default_batch_axis()
    input_seq_axis = Axis('inputAxis')
    label_seq_axis = Axis('labelAxis')

    input_dynamic_axes = [batch_axis, input_seq_axis]
    raw_input = input_variable(
        shape=(input_vocab_dim), dynamic_axes=input_dynamic_axes)

    label_dynamic_axes = [batch_axis, label_seq_axis]
    raw_labels = input_variable(
        shape=(label_vocab_dim), dynamic_axes=label_dynamic_axes)

    # Instantiate the sequence to sequence translation model
    input_sequence = raw_input

    # Drop the sentence start token from the label, for decoder training
    label_sequence = slice(raw_labels, label_seq_axis, 1, 0)
    label_sentence_start = sequence.first(raw_labels)

    is_first_label = sequence.is_first(label_sequence)
    label_sentence_start_scattered = sequence.scatter(
        label_sentence_start, is_first_label)

    # Encoder: stacked LSTMs recurring over future values, so the first
    # step of the output has seen the whole input sequence.
    encoder_outputH = stabilize(input_sequence)
    for _ in range(num_layers):
        (encoder_outputH, encoder_outputC) = LSTMP_component_with_self_stabilization(
            encoder_outputH.output, hidden_dim, hidden_dim,
            future_value, future_value)

    thought_vectorH = sequence.first(encoder_outputH)
    thought_vectorC = sequence.first(encoder_outputC)

    thought_vector_broadcastH = sequence.broadcast_as(
        thought_vectorH, label_sequence)
    thought_vector_broadcastC = sequence.broadcast_as(
        thought_vectorC, label_sequence)

    # Decoder: fed the ground-truth history, delayed by one step, with the
    # sentence-start token in the first position.
    decoder_history_from_ground_truth = label_sequence
    decoder_input = element_select(
        is_first_label, label_sentence_start_scattered,
        past_value(decoder_history_from_ground_truth))

    decoder_outputH = stabilize(decoder_input)
    for layer in range(num_layers):
        if layer > 0:
            recurrence_hookH = past_value
            recurrence_hookC = past_value
        else:
            # The first decoder layer starts its recurrence from the
            # encoder's thought vector instead of the default initial state.
            isFirst = sequence.is_first(label_sequence)
            recurrence_hookH = lambda operand: element_select(
                isFirst, thought_vector_broadcastH, past_value(operand))
            recurrence_hookC = lambda operand: element_select(
                isFirst, thought_vector_broadcastC, past_value(operand))

        # Only the hidden output feeds the next layer; the cell output is
        # not used after this call.
        (decoder_outputH, decoder_outputC) = LSTMP_component_with_self_stabilization(
            decoder_outputH.output, hidden_dim, hidden_dim,
            recurrence_hookH, recurrence_hookC)

    decoder_output = decoder_outputH

    # Softmax output layer
    z = linear_layer(stabilize(decoder_output), label_vocab_dim)
    ce = cross_entropy_with_softmax(z, label_sequence)
    errs = classification_error(z, label_sequence)

    # Instantiate the trainer object to drive the model training
    lr = 0.007
    momentum_time_constant = 1100
    m_schedule = momentum_schedule(momentum_time_constant)
    clipping_threshold_per_sample = 2.3
    gradient_clipping_with_truncation = True

    trainer = Trainer(z, ce, errs, [momentum_sgd(
        z.parameters, lr, m_schedule,
        clipping_threshold_per_sample, gradient_clipping_with_truncation)])

    rel_path = r"../../../../Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b.train-dev-20-21.ctf"
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), rel_path)
    feature_stream_name = 'features'
    labels_stream_name = 'labels'

    mb_source = text_format_minibatch_source(path, [
        StreamConfiguration(feature_stream_name, input_vocab_dim, True, 'S0'),
        StreamConfiguration(labels_stream_name, label_vocab_dim, True, 'S1')], 10000)
    features_si = mb_source[feature_stream_name]
    labels_si = mb_source[labels_stream_name]

    # Get minibatches of sequences to train with and perform model training
    minibatch_size = 72
    training_progress_output_freq = 30
    if debug_output:
        # Integer division keeps the reporting frequency an int.
        training_progress_output_freq = training_progress_output_freq // 3

    # Explicit minibatch counter; it must start at 0 and must not reuse the
    # index left over from the layer loops above.
    train_mb_index = 0
    while True:
        mb = mb_source.next_minibatch(minibatch_size)
        if len(mb) == 0:
            break

        # Specify the mapping of input variables in the model to actual
        # minibatch data to be trained with
        arguments = {raw_input: mb[features_si],
                     raw_labels: mb[labels_si]}
        trainer.train_minibatch(arguments)

        print_training_progress(
            trainer, train_mb_index, training_progress_output_freq)
        train_mb_index += 1

    rel_path = r"../../../../Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b.test.ctf"
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), rel_path)

    test_mb_source = text_format_minibatch_source(path, [
        StreamConfiguration(feature_stream_name, input_vocab_dim, True, 'S0'),
        StreamConfiguration(labels_stream_name, label_vocab_dim, True, 'S1')], 10000, False)
    features_si = test_mb_source[feature_stream_name]
    labels_si = test_mb_source[labels_stream_name]

    # choose this to be big enough for the longest sentence
    test_minibatch_size = 1024

    # Get minibatches of sequences to test and perform testing
    test_mb_count = 0
    total_error = 0.0
    while True:
        mb = test_mb_source.next_minibatch(test_minibatch_size)
        if len(mb) == 0:
            break

        # Specify the mapping of input variables in the model to actual
        # minibatch data to be tested with
        arguments = {raw_input: mb[features_si],
                     raw_labels: mb[labels_si]}
        mb_error = trainer.test_minibatch(arguments)

        total_error += mb_error

        if debug_output:
            print("Minibatch {}, Error {} ".format(test_mb_count, mb_error))

        test_mb_count += 1

    # Average of evaluation errors of all test minibatches
    return total_error / test_mb_count
def sequence_to_sequence_translator(debug_output=False, run_test=False):
    """Train an LSTM encoder/decoder on CMUDict and check it survives a save/load.

    After training, the model is evaluated with ``translator_test_error``,
    saved to ``seq2seq.dnn``, reloaded, and evaluated again; both errors
    must be equal.

    Args:
        debug_output: Not read by this function; kept in the signature for
            callers that pass it.
        run_test: When true, the training data is not randomized and the
            run is shortened (one epoch of 5000 label samples, progress
            reported every 30 minibatches).

    Returns:
        The error returned by ``translator_test_error`` for the trained
        model before it is saved and reloaded.

    Raises:
        AssertionError: If the reloaded model's error differs from the
            error measured before saving.
    """
    input_vocab_dim = 69
    label_vocab_dim = 69

    # network complexity; initially low for faster testing
    hidden_dim = 256
    num_layers = 1

    # Source and target inputs to the model
    batch_axis = Axis.default_batch_axis()
    input_seq_axis = Axis('inputAxis')
    label_seq_axis = Axis('labelAxis')

    input_dynamic_axes = [batch_axis, input_seq_axis]
    raw_input = input_variable(
        shape=(input_vocab_dim), dynamic_axes=input_dynamic_axes,
        name='raw_input')

    label_dynamic_axes = [batch_axis, label_seq_axis]
    raw_labels = input_variable(
        shape=(label_vocab_dim), dynamic_axes=label_dynamic_axes,
        name='raw_labels')

    # Instantiate the sequence to sequence translation model
    input_sequence = raw_input

    # Drop the sentence start token from the label, for decoder training
    label_sequence = slice(raw_labels, label_seq_axis, 1, 0)  # <s> A B C </s> --> A B C </s>
    label_sentence_start = sequence.first(raw_labels)         # <s>

    is_first_label = sequence.is_first(label_sequence)        # flags the first position
    label_sentence_start_scattered = sequence.scatter(
        label_sentence_start, is_first_label)

    # Encoder: LSTM layers recurring over future values, so the first step
    # of the output has seen the whole input sequence.
    encoder_outputH = stabilize(input_sequence)
    for _ in range(num_layers):
        (encoder_outputH, encoder_outputC) = LSTMP_component_with_self_stabilization(
            encoder_outputH.output, hidden_dim, hidden_dim,
            future_value, future_value)

    thought_vectorH = sequence.first(encoder_outputH)
    thought_vectorC = sequence.first(encoder_outputC)

    thought_vector_broadcastH = sequence.broadcast_as(
        thought_vectorH, label_sequence)
    thought_vector_broadcastC = sequence.broadcast_as(
        thought_vectorC, label_sequence)

    # Decoder
    # The alias is a named copy of label_sequence that can later be swapped
    # for the network's own output when cloning the graph.
    decoder_history_hook = alias(label_sequence, name='decoder_history_hook')

    decoder_input = element_select(
        is_first_label, label_sentence_start_scattered,
        past_value(decoder_history_hook))

    decoder_outputH = stabilize(decoder_input)
    for layer in range(num_layers):
        if layer > 0:
            recurrence_hookH = past_value
            recurrence_hookC = past_value
        else:
            # The first decoder layer starts its recurrence from the
            # encoder's thought vector instead of the default initial state.
            isFirst = sequence.is_first(label_sequence)
            recurrence_hookH = lambda operand: element_select(
                isFirst, thought_vector_broadcastH, past_value(operand))
            recurrence_hookC = lambda operand: element_select(
                isFirst, thought_vector_broadcastC, past_value(operand))

        # Only the hidden output feeds the next layer; the cell output is
        # not used after this call.
        (decoder_outputH, decoder_outputC) = LSTMP_component_with_self_stabilization(
            decoder_outputH.output, hidden_dim, hidden_dim,
            recurrence_hookH, recurrence_hookC)

    decoder_output = decoder_outputH

    # Softmax output layer
    z = linear_layer(stabilize(decoder_output), label_vocab_dim)

    # Criterion nodes
    ce = cross_entropy_with_softmax(z, label_sequence)
    errs = classification_error(z, label_sequence)

    # network output for decoder history
    net_output = hardmax(z)

    # make a clone of the graph where the ground truth is replaced by the network output
    ng = z.clone(CloneMethod.share,
                 {decoder_history_hook.output: net_output.output})

    # Instantiate the trainer object to drive the model training
    lr_per_sample = 0.007
    minibatch_size = 72
    momentum_time_constant = momentum_as_time_constant_schedule(1100)
    clipping_threshold_per_sample = 2.3
    gradient_clipping_with_truncation = True
    learner = momentum_sgd(
        z.parameters, lr_per_sample, momentum_time_constant,
        gradient_clipping_threshold_per_sample=clipping_threshold_per_sample,
        gradient_clipping_with_truncation=gradient_clipping_with_truncation)
    trainer = Trainer(z, ce, errs, learner)

    # setup data
    here = os.path.dirname(os.path.abspath(__file__))
    rel_path = r"../../../../Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b.train-dev-20-21.ctf"
    train_path = os.path.join(here, rel_path)
    valid_path = os.path.join(here, "tiny.ctf")

    # readers
    randomize_data = True
    if run_test:
        randomize_data = False  # because we want to get an exact error

    train_reader = create_reader(
        train_path, randomize_data, input_vocab_dim, label_vocab_dim)
    train_bind = {
        raw_input: train_reader.streams.features,
        raw_labels: train_reader.streams.labels,
    }

    # get the vocab for printing output sequences in plaintext
    rel_path = r"../../../../Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b.mapping"
    vocab_path = os.path.join(here, rel_path)
    # Read under a context manager so the file handle is closed promptly.
    with open(vocab_path) as vocab_file:
        vocab = [w.strip() for w in vocab_file]
    i2w = {i: ch for i, ch in enumerate(vocab)}

    # Get minibatches of sequences to train with and perform model training
    i = 0    # label samples consumed so far, across all epochs
    mbs = 0  # minibatches trained so far
    epoch_size = 908241
    max_epochs = 10
    training_progress_output_freq = 500

    # make things more basic for running a quicker test
    if run_test:
        epoch_size = 5000
        max_epochs = 1
        training_progress_output_freq = 30

    valid_reader = create_reader(
        valid_path, False, input_vocab_dim, label_vocab_dim)
    valid_bind = {
        find_arg_by_name('raw_input', ng): valid_reader.streams.features,
        find_arg_by_name('raw_labels', ng): valid_reader.streams.labels,
    }

    for epoch in range(max_epochs):
        loss_numer = 0
        metric_numer = 0
        denom = 0

        while i < (epoch + 1) * epoch_size:
            # get next minibatch of training data
            mb_train = train_reader.next_minibatch(
                minibatch_size, input_map=train_bind)
            trainer.train_minibatch(mb_train)

            # collect epoch-wide stats
            samples = trainer.previous_minibatch_sample_count
            loss_numer += trainer.previous_minibatch_loss_average * samples
            metric_numer += trainer.previous_minibatch_evaluation_average * samples
            denom += samples

            # every N MBs evaluate on a test sequence to visually show how we're doing
            if mbs % training_progress_output_freq == 0:
                mb_valid = valid_reader.next_minibatch(
                    minibatch_size, input_map=valid_bind)
                e = ng.eval(mb_valid)
                print_sequences(e, i2w)

            print_training_progress(trainer, mbs, training_progress_output_freq)
            i += mb_train[raw_labels].num_samples
            mbs += 1

        print("--- EPOCH %d DONE: loss = %f, errs = %f ---" %
              (epoch, loss_numer / denom, 100.0 * (metric_numer / denom)))

    error1 = translator_test_error(z, trainer, input_vocab_dim, label_vocab_dim)

    # Round-trip the model through disk and rebuild the criteria and the
    # trainer around the reloaded network.
    save_model(z, "seq2seq.dnn")
    z = load_model("seq2seq.dnn")

    label_seq_axis = Axis('labelAxis')
    label_sequence = slice(
        find_arg_by_name('raw_labels', z), label_seq_axis, 1, 0)
    ce = cross_entropy_with_softmax(z, label_sequence)
    errs = classification_error(z, label_sequence)
    # The clipping settings are passed by keyword, exactly as for the
    # training learner above; passing them positionally risks binding them
    # to other optional parameters of momentum_sgd.
    trainer = Trainer(z, ce, errs, [
        momentum_sgd(
            z.parameters, lr_per_sample, momentum_time_constant,
            gradient_clipping_threshold_per_sample=clipping_threshold_per_sample,
            gradient_clipping_with_truncation=gradient_clipping_with_truncation)
    ])

    error2 = translator_test_error(z, trainer, input_vocab_dim, label_vocab_dim)

    # The reloaded model must reproduce the pre-save error exactly.
    assert error1 == error2

    return error1