softmax_out = softmax_out.reshape(shape)
softmax_out.name = 'softmax_out'

# comparing only last time-step
cost = CategoricalCrossEntropy().apply(y[-1, :, 0], softmax_out[-1])
cost.name = 'CrossEntropy'
error_rate = MisclassificationRate().apply(y[-1, :, 0], softmax_out[-1])
error_rate.name = 'error_rate'

# Initialization
for brick in (x_to_h1, h1_to_o):
    brick.weights_init = IsotropicGaussian(0.01)
    brick.biases_init = Constant(0)
    brick.initialize()

rnn.weights_init = Identity()
rnn.biases_init = Constant(0)
rnn.initialize()

print('Building training process...')
algorithm = GradientDescent(
    cost=cost,
    parameters=ComputationGraph(cost).parameters,
    step_rule=learning_algorithm(learning_rate=1e-6, momentum=0.0,
                                 clipping_threshold=1.0, algorithm='adam'))

train_stream, valid_stream = MNIST(batch_size=batch_size)
monitor_train_cost = TrainingDataMonitoring(
    [cost, error_rate],
    prefix="train",
    after_epoch=True)  # NOTE: the original call was cut off here; after_epoch=True is an assumed completion
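# `learning_algorithm` is not defined in this excerpt. A minimal sketch of
# what it could look like, assuming the stock Blocks step rules
# (StepClipping, Adam, Momentum, CompositeRule from blocks.algorithms) and
# the keyword names used in the call above:
from blocks.algorithms import Adam, CompositeRule, Momentum, StepClipping

def learning_algorithm(learning_rate, momentum=0.0,
                       clipping_threshold=1.0, algorithm='adam'):
    """Gradient clipping followed by the chosen update rule."""
    clipping = StepClipping(threshold=clipping_threshold)
    if algorithm == 'adam':
        rule = Adam(learning_rate=learning_rate)
    else:
        rule = Momentum(learning_rate=learning_rate, momentum=momentum)
    return CompositeRule([clipping, rule])

# `MNIST` above is likewise a project-local helper (it shadows
# fuel.datasets.MNIST). A rough sketch using Fuel, assuming it returns a
# (train_stream, valid_stream) pair of batch streams:
from fuel.datasets import MNIST as MNISTDataset
from fuel.schemes import SequentialScheme
from fuel.streams import DataStream

def MNIST(batch_size):
    train = MNISTDataset(("train",))
    test = MNISTDataset(("test",))
    train_stream = DataStream.default_stream(
        train,
        iteration_scheme=SequentialScheme(train.num_examples, batch_size))
    valid_stream = DataStream.default_stream(
        test,
        iteration_scheme=SequentialScheme(test.num_examples, batch_size))
    return train_stream, valid_stream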
cg = ComputationGraph(cost)
params_to_sync = {}  # cg.variables
counter = 0
print('---- cg.parameters ----')
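# The loop that fills `params_to_sync` is not shown in this excerpt. A
# plausible sketch, assuming each parameter is keyed by its name plus the
# running counter so its value can later be synchronized through the
# Theano shared variables' get_value()/set_value():
for param in cg.parameters:
    key = '%s_%d' % (param.name, counter)
    params_to_sync[key] = param
    print('%s %s' % (key, param.get_value().shape))
    counter += 1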
lstm = SimpleRecurrent(dim=h_dim, activation=Tanh())
# lstm = GatedRecurrent(dim=h_dim, activation=Tanh())
decode = Linear(name='decode', input_dim=h_dim, output_dim=1)

for brick in (encode, gates, decode):
    brick.weights_init = IsotropicGaussian(0.01)
    brick.biases_init = Constant(0.)
    brick.initialize()

lstm.weights_init = IsotropicGaussian(0.01)
# lstm.weights_init = Orthogonal()
lstm.biases_init = Constant(0.)
lstm.initialize()

# Shape-debugging helpers:
# ComputationGraph(encode.apply(x)).get_theano_function()(features_test)[0].shape
# ComputationGraph(lstm.apply(encoded)).get_theano_function()(features_test)
# ComputationGraph(decode.apply(hiddens[-1])).get_theano_function()(features_test)[0].shape
# ComputationGraph(SquaredError().apply(y, y_hat.flatten())).get_theano_function()(features_test, targets_test)[0].shape

encoded = encode.apply(x)
# hiddens = lstm.apply(encoded, gates.apply(x))
hiddens = lstm.apply(encoded)
y_hat = decode.apply(hiddens[-1])
cost = SquaredError().apply(y, y_hat)
cost.name = 'cost'
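# A quick end-to-end sanity check in the spirit of the commented helpers
# above (assumes `features_test`/`targets_test` are sample batches and
# that cg.inputs orders x before y):
f_cost = ComputationGraph(cost).get_theano_function()
print(f_cost(features_test, targets_test)[0])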