# Module-level fixtures (X, y, X_mb, X_mask, y_mb, y_mask, n_chars,
# minibatch_size, text_minibatch_func) and the graph/layer/optimizer helpers
# (add_datasets_to_graph, bidirectional_gru_recurrent_layer, shift_layer,
# conditional_attention_gru_recurrent_layer, softmax_layer,
# categorical_crossentropy, masked_cost, early_stopping_trainer) are assumed
# to be imported or defined elsewhere in this test module.
import numpy as np
import theano
from collections import OrderedDict


def test_conditional_attention_gru_recurrent():
    random_state = np.random.RandomState(1999)
    graph = OrderedDict()
    n_hid = 5
    n_out = n_chars

    # input (where first dimension is time)
    datasets_list = [X_mb, X_mask, y_mb, y_mask]
    names_list = ["X", "X_mask", "y", "y_mask"]
    X_sym, X_mask_sym, y_sym, y_mask_sym = add_datasets_to_graph(
        datasets_list, names_list, graph)

    h = bidirectional_gru_recurrent_layer([X_sym], X_mask_sym, n_hid, graph,
                                          'l1_end', random_state)
    shifted_y_sym = shift_layer([y_sym], graph, 'shift')
    h_dec, context, attention = conditional_attention_gru_recurrent_layer(
        [y_sym], [h], y_mask_sym, X_mask_sym, n_hid, graph, 'l2_dec',
        random_state)

    # softmax output activation over decoder state, context, and shifted targets
    y_hat = softmax_layer([h_dec, context, shifted_y_sym], graph, 'l2_proj',
                          n_out, random_state=random_state)

    # error between output and target, with padded timesteps masked out
    cost = categorical_crossentropy(y_hat, y_sym)
    cost = masked_cost(cost, y_mask_sym).mean()

    # Parameters of the model (fitting disabled here; only cost is compiled)
    """
    params, grads = get_params_and_grads(graph, cost)

    # Use stochastic gradient descent to optimize
    opt = sgd(params)
    learning_rate = 0.00000
    updates = opt.updates(params, grads, learning_rate)

    fit_function = theano.function([X_sym, X_mask_sym, y_sym, y_mask_sym],
                                   [cost], updates=updates,
                                   mode="FAST_COMPILE")
    """
    cost_function = theano.function([X_sym, X_mask_sym, y_sym, y_mask_sym],
                                    [cost], mode="FAST_COMPILE")

    checkpoint_dict = {}
    train_indices = np.arange(len(X))
    valid_indices = np.arange(len(X))
    early_stopping_trainer(
        cost_function, cost_function, train_indices, valid_indices,
        checkpoint_dict, [X, y], minibatch_size,
        list_of_minibatch_functions=[text_minibatch_func],
        list_of_train_output_names=["cost"],
        valid_output_name="valid_cost", n_epochs=1)
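# Hedged illustration (hypothetical helper, not the library's masked_cost):
# per-timestep costs are zeroed wherever the mask is 0, so padded steps do not
# contribute before the .mean() taken in the test above. Shapes follow the
# (n_timesteps, n_samples) convention used for the masks here.
def masked_cost_sketch(step_costs, mask):
    # element-wise mask: keep real timesteps, zero out padding
    return step_costs * mask

# e.g. a 3-step, 2-sample batch where the second sample is only 2 steps long
_costs = np.array([[1.0, 2.0], [1.0, 3.0], [1.0, 4.0]])
_mask = np.array([[1.0, 1.0], [1.0, 1.0], [1.0, 0.0]])
print(masked_cost_sketch(_costs, _mask).mean())  # 8.0 / 6 ~ 1.33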
# Assumed setup (mirroring the test above): X, y, vocab_size and the layer,
# optimizer, and minibatch helpers are defined earlier in this script.
random_state = np.random.RandomState(1999)
graph = OrderedDict()

minibatch_size = 10
X_mb, X_mb_mask = make_masked_minibatch(X, slice(0, minibatch_size))
y_mb, y_mb_mask = make_masked_minibatch(y, slice(0, minibatch_size))
n_hid = 500
n_out = vocab_size + 1  # one extra output class, conventionally the CTC blank

datasets_list = [X_mb, X_mb_mask, y_mb, y_mb_mask]
names_list = ["X", "X_mask", "y", "y_mask"]
X_sym, X_mask_sym, y_sym, y_mask_sym = add_datasets_to_graph(
    datasets_list, names_list, graph)

l1 = maxout_layer([X_sym], graph, 'l1', n_hid, random_state=random_state)
h = bidirectional_gru_recurrent_layer([l1], X_mask_sym, n_hid, graph, 'l1_rec',
                                      random_state=random_state)
l2 = maxout_layer([h], graph, 'l2', n_hid, random_state=random_state)
y_pred = softmax_layer([l2], graph, 'softmax', n_out,
                       random_state=random_state)

cost = log_ctc_cost(y_sym, y_mask_sym, y_pred, X_mask_sym).mean()
params, grads = get_params_and_grads(graph, cost)
opt = adadelta(params)
updates = opt.updates(params, grads)

checkpoint_dict = {}
fit_function = theano.function([X_sym, X_mask_sym, y_sym, y_mask_sym], [cost],
                               updates=updates)
cost_function = theano.function([X_sym, X_mask_sym, y_sym, y_mask_sym], [cost])
predict_function = theano.function([X_sym, X_mask_sym], [y_pred])
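# Hedged sketch of greedy CTC decoding (hypothetical helper; this script only
# compiles predict_function and does not define a decoder). It assumes the
# extra class index n_out - 1 == vocab_size is the CTC blank, and operates on
# one sequence of per-step class probabilities with shape
# (n_timesteps, n_classes): take the argmax path, collapse consecutive
# repeats, then drop blanks.
def ctc_greedy_decode(step_probs, blank=None):
    if blank is None:
        blank = step_probs.shape[-1] - 1
    path = np.argmax(step_probs, axis=-1)
    decoded = []
    prev = None
    for label in path:
        # collapse repeated labels and skip blanks in one pass
        if label != prev and label != blank:
            decoded.append(int(label))
        prev = label
    return decoded

# e.g. probs, = predict_function(X_mb, X_mb_mask); ctc_greedy_decode(probs[:, 0])
# (restricting to timesteps where X_mb_mask is 1 for the chosen sample)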