def test_feedforward_theano_mix():
    minibatch_size = 100
    random_state = np.random.RandomState(1999)
    graph = OrderedDict()
    X_sym, y_sym = add_datasets_to_graph([X, y], ["X", "y"], graph)

    l1_o = linear_layer([X_sym], graph, 'l1', proj_dim=20,
                        random_state=random_state)
    # Arbitrary Theano expressions can be mixed in with layer outputs
    l1_o = .999 * l1_o
    y_pred = softmax_layer([l1_o], graph, 'pred', n_classes,
                           random_state=random_state)

    cost = categorical_crossentropy(y_pred, y_sym).mean()
    params, grads = get_params_and_grads(graph, cost)
    learning_rate = 0.001
    opt = sgd(params)
    updates = opt.updates(params, grads, learning_rate)

    fit_function = theano.function([X_sym, y_sym], [cost], updates=updates,
                                   mode="FAST_COMPILE")
    cost_function = theano.function([X_sym, y_sym], [cost],
                                    mode="FAST_COMPILE")

    checkpoint_dict = {}
    train_indices = np.arange(len(X))
    valid_indices = np.arange(len(X))
    early_stopping_trainer(fit_function, cost_function,
                           checkpoint_dict, [X, y], minibatch_size,
                           train_indices, valid_indices,
                           fit_function_output_names=["cost"],
                           cost_function_output_name="valid_cost",
                           n_epochs=1)

def test_conditional_gru_recurrent():
    random_state = np.random.RandomState(1999)
    graph = OrderedDict()
    n_hid = 5
    n_out = n_chars

    # input (where the first dimension is time)
    datasets_list = [X_mb, X_mask, y_mb, y_mask]
    names_list = ["X", "X_mask", "y", "y_mask"]
    X_sym, X_mask_sym, y_sym, y_mask_sym = add_datasets_to_graph(
        datasets_list, names_list, graph)

    # encode the input sequence
    h = gru_recurrent_layer([X_sym], X_mask_sym, n_hid, graph, 'l1_end',
                            random_state)
    # shift targets one step in time so the decoder sees the previous symbol
    shifted_y_sym = shift_layer([y_sym], graph, 'shift')
    h_dec, context = conditional_gru_recurrent_layer([y_sym], [h], y_mask_sym,
                                                     n_hid, graph, 'l2_dec',
                                                     random_state)
    # softmax output distribution over characters
    y_hat = softmax_layer([h_dec, context, shifted_y_sym], graph, 'l2_proj',
                          n_out, random_state)

    # error between output and target, masked past each sequence's length
    cost = categorical_crossentropy(y_hat, y_sym)
    cost = masked_cost(cost, y_mask_sym).mean()

    # Parameters of the model (training path disabled for this test)
    """
    params, grads = get_params_and_grads(graph, cost)

    # Use stochastic gradient descent to optimize
    opt = sgd(params)
    learning_rate = 0.00000
    updates = opt.updates(params, grads, learning_rate)

    fit_function = theano.function([X_sym, X_mask_sym, y_sym, y_mask_sym],
                                   [cost], updates=updates,
                                   mode="FAST_COMPILE")
    """
    cost_function = theano.function([X_sym, X_mask_sym, y_sym, y_mask_sym],
                                    [cost], mode="FAST_COMPILE")

    checkpoint_dict = {}
    train_indices = np.arange(len(X))
    valid_indices = np.arange(len(X))
    # cost_function stands in for the fit function since updates are disabled
    early_stopping_trainer(cost_function, cost_function,
                           checkpoint_dict, [X, y], minibatch_size,
                           train_indices, valid_indices,
                           list_of_minibatch_functions=[text_minibatch_func],
                           fit_function_output_names=["cost"],
                           cost_function_output_name="valid_cost",
                           n_epochs=1)

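# For intuition, the "shift" above implements teacher forcing: at each time
# step the decoder is conditioned on the previous true target symbol, so the
# targets are shifted one step in time with zeros at the first step. A
# minimal numpy sketch of the idea (an illustration, not necessarily
# shift_layer's exact implementation):
import numpy as np


def shift_targets(y):
    # y has shape (time, minibatch, features); prepend a zero step and
    # drop the final step so position t holds the symbol from t - 1
    shifted = np.zeros_like(y)
    shifted[1:] = y[:-1]
    return shifted
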
def test_softmax_sample_layer():
    random_state = np.random.RandomState(42)
    graph = OrderedDict()
    X_sym, y_sym = add_datasets_to_graph([X, y], ["X", "y"], graph)
    softmax = softmax_layer([X_sym], graph, 'softmax', proj_dim=20,
                            random_state=random_state)
    samp = softmax_sample_layer([softmax], graph, 'softmax_sample',
                                random_state=random_state)
    out = linear_layer([samp], graph, 'out', proj_dim=10,
                       random_state=random_state)
    f = theano.function([X_sym], [out], mode="FAST_COMPILE")

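# Sampling from a softmax output is a categorical draw per example (a
# multinomial with n=1). A numpy sketch of the idea, for intuition only;
# softmax_sample_layer itself builds this into the Theano graph:
import numpy as np


def sample_from_softmax(probs, random_state):
    # probs has shape (minibatch, n_classes); each row sums to 1
    samples = np.zeros_like(probs)
    for i, p in enumerate(probs):
        # one-hot draw from the categorical distribution for example i
        samples[i] = random_state.multinomial(1, p)
    return samples
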
def test_softmax_zeros_layer():
    graph = OrderedDict()
    X_sym, y_sym = add_datasets_to_graph([X, y], ["X", "y"], graph)
    single_o = softmax_zeros_layer([X_sym], graph, 'single', proj_dim=5)
    concat_o = softmax_zeros_layer([X_sym, y_sym], graph, 'concat',
                                   proj_dim=5)
    # Check that things can be reused
    repeated_o = softmax_layer([X_sym], graph, 'single', strict=False)
    # Check that strict mode raises an error if repeated
    assert_raises(AttributeError, softmax_layer, [X_sym], graph, 'concat')
    f = theano.function([X_sym, y_sym], [single_o, concat_o, repeated_o],
                        mode="FAST_COMPILE")
    single, concat, repeat = f(X, y)
    assert_almost_equal(single, repeat)

def test_feedforward_classifier():
    minibatch_size = 100
    random_state = np.random.RandomState(1999)
    graph = OrderedDict()
    X_sym, y_sym = add_datasets_to_graph([X, y], ["X", "y"], graph)
    l1_o = linear_layer([X_sym], graph, "l1", proj_dim=20,
                        random_state=random_state)
    y_pred = softmax_layer([l1_o], graph, "pred", n_classes,
                           random_state=random_state)

    cost = categorical_crossentropy(y_pred, y_sym).mean()
    params, grads = get_params_and_grads(graph, cost)
    learning_rate = 0.001
    opt = sgd(params)
    updates = opt.updates(params, grads, learning_rate)

    train_function = theano.function([X_sym, y_sym], [cost], updates=updates,
                                     mode="FAST_COMPILE")
    iterate_function(train_function, [X, y], minibatch_size,
                     list_of_output_names=["cost"], n_epochs=1)

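# The sgd optimizer above performs plain stochastic gradient descent. As a
# worked illustration of the update rule (assumed behavior, written in numpy
# rather than as the optimizer's actual Theano updates):
#     param <- param - learning_rate * grad
import numpy as np


def sgd_updates(params, grads, learning_rate):
    # one descent step for every parameter in the model
    return [p - learning_rate * g for p, g in zip(params, grads)]
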
vocab = data["vocabulary"]
train_indices = data["train_indices"]
valid_indices = train_indices
X_mb, X_mb_mask = make_masked_minibatch(X, slice(0, len(X)))
y_mb, y_mb_mask = make_masked_minibatch(y, slice(0, len(y)))

n_hid = 256
n_out = vocab_size + 1  # vocabulary plus one extra class, the CTC blank

datasets_list = [X_mb, X_mb_mask, y_mb, y_mb_mask]
names_list = ["X", "X_mask", "y", "y_mask"]
X_sym, X_mask_sym, y_sym, y_mask_sym = add_datasets_to_graph(
    datasets_list, names_list, graph)

h = gru_recurrent_layer([X_sym], X_mask_sym, n_hid, graph, "l1_rec",
                        random_state=random_state)
y_pred = softmax_layer([h], graph, "l2_proj", n_out,
                       random_state=random_state)
cost = log_ctc_cost(y_sym, y_mask_sym, y_pred, X_mask_sym).mean()

params, grads = get_params_and_grads(graph, cost)
opt = adadelta(params)
updates = opt.updates(params, grads)

checkpoint_dict = {}
fit_function = theano.function([X_sym, X_mask_sym, y_sym, y_mask_sym],
                               [cost], updates=updates)
cost_function = theano.function([X_sym, X_mask_sym, y_sym, y_mask_sym],
                                [cost])
predict_function = theano.function([X_sym, X_mask_sym], [y_pred])


def prediction_strings(y_pred):
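    # The excerpt ends before this helper's body; what follows is a plausible
    # sketch of best-path CTC decoding, assuming y_pred has shape
    # (time, minibatch, n_out), the blank symbol is the last class (index
    # vocab_size), and vocab maps indices to characters. This is an
    # assumption about the helper's intent, not the original implementation.
    import itertools
    y_pred = np.asarray(y_pred)
    strings = []
    for n in range(y_pred.shape[1]):
        # most likely class at each frame for example n
        frame_best = y_pred[:, n].argmax(axis=-1)
        # collapse repeated symbols, then drop blanks
        collapsed = [k for k, _ in itertools.groupby(frame_best)]
        chars = [vocab[i] for i in collapsed if i != vocab_size]
        strings.append("".join(chars))
    return strings
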
X_sym, y_sym = add_datasets_to_graph([X, y], ["X", "y"], graph)
# random state so script is deterministic
random_state = np.random.RandomState(1999)

minibatch_size = 100
n_code = 100
n_enc_layer = [200, 200, 200]
n_dec_layer = [200, 200]
width = 48
height = 48
n_input = width * height

# q(y_pred | x)
y_l1_enc = softplus_layer([X_sym], graph, 'y_l1_enc', n_enc_layer[0],
                          random_state)
y_pred = softmax_layer([y_l1_enc], graph, 'y_pred', n_targets, random_state)

# partial q(z | x, y_pred)
X_l1_enc = softplus_layer([X_sym, y_pred], graph, 'X_l1_enc', n_enc_layer[1],
                          random_state)
# combined q(y_pred | x) and partial q(z | x) for q(z | x, y_pred)
l2_enc = softplus_layer([X_l1_enc], graph, 'l2_enc', n_enc_layer[2],
                        random_state)

# code layer
code_mu = linear_layer([l2_enc], graph, 'code_mu', n_code, random_state)
code_log_sigma = linear_layer([l2_enc], graph, 'code_log_sigma', n_code,
                              random_state)
kl = gaussian_log_kl([code_mu], [code_log_sigma], graph, 'kl').mean()
samp = gaussian_log_sample_layer([code_mu], [code_log_sigma], graph, 'samp',
                                 random_state)
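# For intuition: gaussian_log_sample_layer implements the VAE
# "reparameterization trick", and gaussian_log_kl the KL divergence of a
# diagonal Gaussian q(z | x) = N(mu, sigma^2) from the standard normal
# prior. A numpy sketch of both, renamed _sketch to avoid shadowing the
# actual layers (an illustration, not the layers' real code):
import numpy as np


def gaussian_log_sample_sketch(mu, log_sigma, random_state):
    # z = mu + sigma * eps with eps ~ N(0, 1); sampling stays
    # differentiable with respect to mu and log_sigma
    eps = random_state.standard_normal(mu.shape)
    return mu + np.exp(log_sigma) * eps


def gaussian_log_kl_sketch(mu, log_sigma):
    # KL(N(mu, sigma^2) || N(0, 1)) summed over code dimensions
    return 0.5 * np.sum(np.exp(2 * log_sigma) + mu ** 2 - 1 - 2 * log_sigma,
                        axis=-1)
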
X_mb, X_mb_mask = make_masked_minibatch(X, slice(0, minibatch_size))
y_mb, y_mb_mask = make_masked_minibatch(y, slice(0, minibatch_size))

n_hid = 500
n_out = vocab_size + 1  # vocabulary plus one extra class, the CTC blank

datasets_list = [X_mb, X_mb_mask, y_mb, y_mb_mask]
names_list = ["X", "X_mask", "y", "y_mask"]
X_sym, X_mask_sym, y_sym, y_mask_sym = add_datasets_to_graph(
    datasets_list, names_list, graph)

l1 = maxout_layer([X_sym], graph, 'l1', n_hid, random_state=random_state)
h = bidirectional_gru_recurrent_layer([l1], X_mask_sym, n_hid, graph,
                                      'l1_rec', random_state=random_state)
l2 = maxout_layer([h], graph, 'l2', n_hid, random_state=random_state)
y_pred = softmax_layer([l2], graph, 'softmax', n_out,
                       random_state=random_state)
cost = log_ctc_cost(y_sym, y_mask_sym, y_pred, X_mask_sym).mean()

params, grads = get_params_and_grads(graph, cost)
opt = adadelta(params)
updates = opt.updates(params, grads)

checkpoint_dict = {}
fit_function = theano.function([X_sym, X_mask_sym, y_sym, y_mask_sym],
                               [cost], updates=updates)
cost_function = theano.function([X_sym, X_mask_sym, y_sym, y_mask_sym],
                                [cost])
predict_function = theano.function([X_sym, X_mask_sym], [y_pred])

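# make_masked_minibatch pads variable-length sequences to a common length
# and returns a mask marking the valid steps, which the recurrent layers
# and masked costs above consume. A minimal numpy sketch of the idea
# (time-first layout, as in the recurrent examples; not necessarily the
# library's exact implementation):
import numpy as np


def make_masked_minibatch_sketch(sequences):
    # sequences: list of arrays, each with shape (time_i, features)
    max_len = max(len(s) for s in sequences)
    n_features = sequences[0].shape[-1]
    minibatch = np.zeros((max_len, len(sequences), n_features))
    mask = np.zeros((max_len, len(sequences)))
    for i, s in enumerate(sequences):
        minibatch[:len(s), i] = s
        mask[:len(s), i] = 1.  # 1 marks real steps, 0 marks padding
    return minibatch, mask
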
random_state = np.random.RandomState(1999)

minibatch_size = 100
n_code = 100
n_enc_layer = 200
n_dec_layer = 200
width = 28
height = 28
n_input = width * height

# q(y_pred | x)
y_l1_enc = softplus_layer([X_sym], graph, 'y_l1_enc', n_enc_layer,
                          random_state=random_state)
y_l2_enc = softplus_layer([y_l1_enc], graph, 'y_l2_enc', n_targets,
                          random_state=random_state)
y_pred = softmax_layer([y_l2_enc], graph, 'y_pred', n_targets,
                       random_state=random_state)

# partial q(z | x)
X_l1_enc = softplus_layer([X_sym], graph, 'X_l1_enc', n_enc_layer,
                          random_state=random_state)
X_l2_enc = softplus_layer([X_l1_enc], graph, 'X_l2_enc', n_enc_layer,
                          random_state=random_state)

# combined q(y_pred | x) and partial q(z | x) for q(z | x, y_pred)
l3_enc = softplus_layer([X_l2_enc, y_pred], graph, 'l3_enc', n_enc_layer,
                        random_state=random_state)
l4_enc = softplus_layer([l3_enc], graph, 'l4_enc', n_enc_layer,
                        random_state=random_state)

# code layer
n_targets = 10
y = convert_to_one_hot(y, n_targets)

# graph holds information necessary to build layers from parents
graph = OrderedDict()
X_sym, y_sym = add_datasets_to_graph([X, y], ["X", "y"], graph)
# random state so script is deterministic
random_state = np.random.RandomState(1999)

minibatch_size = 100
n_hid = 512

# q(y_pred | x)
l1 = relu_layer([X_sym], graph, 'l1', n_hid, random_state)
l2 = relu_layer([l1], graph, 'l2', n_hid, random_state)
y_pred = softmax_layer([l2], graph, 'y_pred', n_targets, random_state)
nll = categorical_crossentropy(y_pred, y_sym).mean()
cost = nll

params, grads = get_params_and_grads(graph, cost)
learning_rate = 0.0002
opt = adam(params)
updates = opt.updates(params, grads, learning_rate)

# Checkpointing
try:
    checkpoint_dict = load_last_checkpoint()
    fit_function = checkpoint_dict["fit_function"]
    cost_function = checkpoint_dict["cost_function"]
    predict_function = checkpoint_dict["predict_function"]
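# convert_to_one_hot, used at the top of this excerpt, expands integer class
# labels into one-hot rows. A minimal numpy sketch of the transformation (an
# illustration, not necessarily the library's implementation):
import numpy as np


def convert_to_one_hot_sketch(labels, n_targets):
    labels = np.asarray(labels, dtype="int32")
    one_hot = np.zeros((len(labels), n_targets))
    # set a single 1 per row at the label's column index
    one_hot[np.arange(len(labels)), labels] = 1.
    return one_hot
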
(X_story_syms, X_query_syms), (X_story_mask_sym, X_query_mask_sym) = r
y_sym = add_datasets_to_graph([y_answer], ["y"], graph)

l1_story = embedding_layer(X_story_syms, vocab_size, n_emb, graph,
                           'l1_story', random_state=random_state)
masked_story = X_story_mask_sym.dimshuffle(0, 1, 'x') * l1_story
h_story = gru_recurrent_layer([masked_story], X_story_mask_sym, n_hid, graph,
                              'story_rec', random_state)

l1_query = embedding_layer(X_query_syms, vocab_size, n_emb, graph,
                           'l1_query', random_state)
h_query = gru_recurrent_layer([l1_query], X_query_mask_sym, n_hid, graph,
                              'query_rec', random_state)

# predict the answer from the final hidden states of query and story
y_pred = softmax_layer([h_query[-1], h_story[-1]], graph, 'y_pred',
                       y_answer.shape[1], random_state=random_state)
cost = categorical_crossentropy(y_pred, y_sym).mean()

params, grads = get_params_and_grads(graph, cost)
opt = adadelta(params)
updates = opt.updates(params, grads)

print("Compiling fit...")
fit_function = theano.function(X_story_syms + [X_story_mask_sym] +
                               X_query_syms + [X_query_mask_sym, y_sym],
                               [cost], updates=updates)
print("Compiling cost...")
cost_function = theano.function(X_story_syms + [X_story_mask_sym] +
                                X_query_syms + [X_query_mask_sym, y_sym],
                                [cost])
print("Compiling predict...")
predict_function = theano.function(X_story_syms + [X_story_mask_sym] +
                                   X_query_syms + [X_query_mask_sym],
                                   [y_pred])

# graph holds information necessary to build layers from parents
graph = OrderedDict()
X_sym, y_sym = add_datasets_to_graph([X[:minibatch_size],
                                      y[:minibatch_size]], ["X", "y"], graph)
# random state so script is deterministic
random_state = np.random.RandomState(1999)

l1 = conv2d_layer([X_sym], graph, 'conv1', 8, random_state=random_state)
l2 = pool2d_layer([l1], graph, 'pool1')
l3 = conv2d_layer([l2], graph, 'conv2', 16, random_state=random_state)
l4 = pool2d_layer([l3], graph, 'pool2')
# flatten pooled feature maps to (minibatch, features) for the softmax
l5 = l4.reshape((l4.shape[0], -1))
y_pred = softmax_layer([l5], graph, 'y_pred', n_targets,
                       random_state=random_state)
nll = categorical_crossentropy(y_pred, y_sym).mean()
cost = nll

params, grads = get_params_and_grads(graph, cost)
learning_rate = 0.001
momentum = 0.9
opt = sgd_nesterov(params, learning_rate, momentum)
updates = opt.updates(params, grads)

fit_function = theano.function([X_sym, y_sym], [cost], updates=updates)
cost_function = theano.function([X_sym, y_sym], [cost])
predict_function = theano.function([X_sym], [y_pred])

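# sgd_nesterov adds Nesterov momentum to plain SGD. A numpy sketch of one
# common reformulation of the update (an assumption about the optimizer's
# internals, shown for illustration only):
import numpy as np


def sgd_nesterov_update(param, grad, velocity, learning_rate, momentum):
    # velocity accumulates a decaying sum of past gradient steps
    new_velocity = momentum * velocity - learning_rate * grad
    # the "lookahead" step applies the momentum term a second time
    new_param = param + momentum * new_velocity - learning_rate * grad
    return new_param, new_velocity
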