algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                            step_rule=Scale(learning_rate=0.1))

train_set = H5PYDataset('mushrooms.hdf5', which_sets=('train',))
train_stream = DataStream.default_stream(
    train_set,
    iteration_scheme=SequentialScheme(train_set.num_examples, batch_size=128))

test_set = H5PYDataset('mushrooms.hdf5', which_sets=('test',))
test_stream = DataStream.default_stream(
    test_set,
    iteration_scheme=SequentialScheme(test_set.num_examples, batch_size=128))

main = MainLoop(
    model=Model(cost),
    data_stream=train_stream,
    algorithm=algorithm,
    extensions=[
        FinishAfter(after_n_epochs=10),
        Printing(),
        TrainingDataMonitoring([cost, error_rate], after_batch=True,
                               prefix='train'),
        DataStreamMonitoring([cost, error_rate], after_batch=True,
                             data_stream=test_stream, prefix='test'),
        # Plot is the Bokeh-based live-plotting extension (blocks-extras).
        Plot('Train', channels=[['train_cost', 'test_cost'],
                                ['train_error_rate', 'test_error_rate']])
    ])
main.run()

# Visualize the learned weight matrices as Hinton diagrams.
hinton(W1.get_value())
hinton(W2.get_value())
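The fragment above starts mid-call and relies on names built earlier (`cost`, `error_rate`, `cg`, `W1`, `W2`, and a `hinton` weight-plotting helper that is not shown). A minimal sketch of the assumed setup, modeled on the `train()` function at the end of this section; the layer sizes here are guesses, and `hinton` remains undefined:

from theano import tensor
from blocks.bricks import Linear, Logistic, Softmax
from blocks.bricks.cost import CategoricalCrossEntropy, MisclassificationRate
from blocks.filter import VariableFilter
from blocks.graph import ComputationGraph
from blocks.initialization import Constant, IsotropicGaussian
from blocks.roles import WEIGHT

x = tensor.matrix('features')
y = tensor.lmatrix('targets')

# Two-layer classifier; input_dim must match the feature width of mushrooms.hdf5.
l1 = Linear(name='input_to_hidden', input_dim=117, output_dim=10,
            weights_init=IsotropicGaussian(0.1), biases_init=Constant(0))
l1.initialize()
h = Logistic().apply(l1.apply(x))
l2 = Linear(name='hidden_to_output', input_dim=10, output_dim=2,
            weights_init=IsotropicGaussian(0.1), biases_init=Constant(0))
l2.initialize()
y_hat = Softmax().apply(l2.apply(h))

# Named so the monitored channels match the Plot extension's channel names.
cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
cost.name = 'cost'
error_rate = MisclassificationRate().apply(y.flatten(), y_hat)
error_rate.name = 'error_rate'

cg = ComputationGraph(cost)
W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)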
# Define a multi-layer network; the layer-by-layer computation was defined
# earlier. The activations list gives each layer's nonlinearity: every MLP
# layer first applies a linear transformation, then passes the result through
# its nonlinearity. x is the input to the MLP.
mlp = MLP(activations=[Rectifier(), Softmax()], dims=[784, 100, 10]).apply(x)

# With the network defined, set initial values for the parameters of its
# linear transformations.
input_to_hidden.weights_init = IsotropicGaussian(0.01)
input_to_hidden.biases_init = Constant(0)
hidden_to_output.weights_init = IsotropicGaussian(0.01)
hidden_to_output.biases_init = Constant(0)

# initialize() actually applies the settings above; this step is mandatory,
# otherwise they have no effect.
input_to_hidden.initialize()
hidden_to_output.initialize()
print W1.get_value()

# Train on the built-in MNIST dataset; any other dataset would first need to
# be preprocessed with Fuel.
mnist = MNIST(("train",))

# Iterate with mini-batches of 256 examples each to obtain the data stream.
data_stream = Flatten(DataStream.default_stream(
    mnist,
    iteration_scheme=SequentialScheme(mnist.num_examples, batch_size=256)))

# Optimize the cost with plain gradient descent (SGD with a fixed learning rate).
algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                            step_rule=Scale(learning_rate=0.01))
print "------"

# Monitor selected quantities with DataStreamMonitoring, validating on the
# test set so the model's test performance can be watched during training.
mnist_test = MNIST(("test",))
data_stream_test = Flatten(DataStream.default_stream(
    mnist_test,
    iteration_scheme=SequentialScheme(mnist_test.num_examples,
                                      batch_size=1024)))
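The MNIST snippet likewise assumes earlier definitions of `x`, `input_to_hidden`, `hidden_to_output`, `cost`, `cg`, and `W1`. A minimal sketch of those definitions, following the standard Blocks getting-started MLP (the 784-100-10 dimensions come from the `dims` argument above; the variable names are assumptions):

from theano import tensor
from blocks.bricks import Linear, Rectifier, Softmax
from blocks.bricks.cost import CategoricalCrossEntropy
from blocks.filter import VariableFilter
from blocks.graph import ComputationGraph
from blocks.roles import WEIGHT

x = tensor.matrix('features')
y = tensor.lmatrix('targets')

# The two Linear bricks whose weights and biases the snippet initializes.
input_to_hidden = Linear(name='input_to_hidden', input_dim=784, output_dim=100)
h = Rectifier().apply(input_to_hidden.apply(x))
hidden_to_output = Linear(name='hidden_to_output', input_dim=100, output_dim=10)
y_hat = Softmax().apply(hidden_to_output.apply(h))

cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
cg = ComputationGraph(cost)
W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)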
def run_experiment():
    np.random.seed(42)

    X = tensor.tensor4('features')
    nbr_channels = 3
    image_shape = (5, 5)

    conv_layers = [ConvolutionalLayer(filter_size=(2, 2),
                                      num_filters=10,
                                      activation=Rectifier().apply,
                                      border_mode='valid',
                                      pooling_size=(1, 1),
                                      weights_init=Uniform(width=0.1),
                                      biases_init=Constant(0.0),
                                      name='conv0')]
    conv_sequence = ConvolutionalSequence(conv_layers,
                                          num_channels=nbr_channels,
                                          image_size=image_shape)
    conv_sequence.initialize()

    flattener = Flattener()
    conv_output = conv_sequence.apply(X)
    y_hat = flattener.apply(conv_output)

    # Whatever. Not important since we're not going to actually train anything.
    cost = tensor.sqr(y_hat).sum()

    # Gradients of the cost with respect to the bias parameters only.
    L_grads_method_02 = [tensor.grad(cost, v)
                         for v in VariableFilter(roles=[BIAS])(
                             ComputationGraph([y_hat]).variables)]

    # Works on the sum of the gradients in a mini-batch.
    sum_square_norm_gradients_method_02 = sum(
        [tensor.sqr(g).sum() for g in L_grads_method_02])

    D_by_layer = get_conv_layers_transformation_roles(
        ComputationGraph(conv_output))
    individual_sum_square_norm_gradients_method_00 = (
        get_sum_square_norm_gradients_conv_transformations(D_by_layer, cost))

    # why does this thing depend on N again ?
    # I don't think I've used a cost that divides by N.
    N = 2
    Xtrain = np.random.randn(N, nbr_channels,
                             image_shape[0], image_shape[1]).astype(np.float32)
    Xtrain[:, :, :, :] = 1.0

    convolution_filter_variable = VariableFilter(roles=[FILTER])(
        ComputationGraph([y_hat]).variables)[0]
    convolution_filter_variable_value = convolution_filter_variable.get_value()
    convolution_filter_variable_value[:, :, :, :] = 1.0
    convolution_filter_variable.set_value(convolution_filter_variable_value)

    f = theano.function([X],
                        [cost,
                         individual_sum_square_norm_gradients_method_00,
                         sum_square_norm_gradients_method_02])

    # Evaluate on the whole mini-batch in one shot ...
    [c, v0, gs2] = f(Xtrain)

    # ... then on each example individually, accumulating the results.
    L_c, L_v0, L_gs2 = ([], [], [])
    for n in range(N):
        [nc, nv0, ngs2] = f(Xtrain[n, :, :, :].reshape(
            (1, Xtrain.shape[1], Xtrain.shape[2], Xtrain.shape[3])))
        L_c.append(nc)
        L_v0.append(nv0)
        L_gs2.append(ngs2)

    print "Cost for whole mini-batch in single shot : %f." % c
    print "Cost for whole mini-batch accumulated : %f." % sum(L_c)
    print ""
    print "Square-norm of all gradients for each data point in single shot :"
    print v0.reshape((1, -1))
    print "Square-norm of all gradients for each data point iteratively :"
    print np.array(L_gs2).reshape((1, -1))
    print ""
    print "Difference max abs : %f." % np.max(np.abs(v0 - np.array(L_gs2)))
    print ""
    print "Ratios : "
    print np.array(L_gs2).reshape((1, -1)) / v0.reshape((1, -1))
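`run_experiment()` leaves its imports out, and `get_conv_layers_transformation_roles` / `get_sum_square_norm_gradients_conv_transformations` are project-local helpers that are not reproduced here. The block below sketches the imports the function appears to need under the early Blocks API it targets (`ConvolutionalLayer` existed in blocks.bricks.conv at the time; later releases split it into separate convolution and pooling bricks):

import numpy as np
import theano
from theano import tensor

from blocks.bricks import Rectifier
from blocks.bricks.conv import (ConvolutionalLayer, ConvolutionalSequence,
                                Flattener)
from blocks.filter import VariableFilter
from blocks.graph import ComputationGraph
from blocks.initialization import Constant, Uniform
from blocks.roles import BIAS, FILTER

if __name__ == '__main__':
    run_experiment()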
from theano import tensor
from blocks.algorithms import GradientDescent, RMSProp
from blocks.bricks import Linear, Logistic, Softmax
from blocks.bricks.cost import CategoricalCrossEntropy, MisclassificationRate
from blocks.extensions import FinishAfter, Printing
from blocks.extensions.monitoring import DataStreamMonitoring
from blocks.filter import VariableFilter
from blocks.graph import ComputationGraph
from blocks.initialization import Constant, IsotropicGaussian
from blocks.main_loop import MainLoop
from blocks.roles import WEIGHT
from fuel.schemes import SequentialScheme, ShuffledScheme
from fuel.streams import DataStream
from fuel.transformers import Flatten


def train(train_set, test_set):
    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')

    l1 = Linear(name='input_to_hidden', input_dim=2, output_dim=3,
                weights_init=IsotropicGaussian(0.1), biases_init=Constant(0))
    l1.initialize()
    h = Logistic().apply(l1.apply(x))

    l2 = Linear(name='hidden_to_output', input_dim=l1.output_dim, output_dim=2,
                weights_init=IsotropicGaussian(0.1), biases_init=Constant(0))
    l2.initialize()
    y_hat = Softmax().apply(l2.apply(h))

    cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
    error = MisclassificationRate().apply(y.flatten(), y_hat)
    error.name = 'misclassification_rate'

    # Add a small L2 penalty on both weight matrices.
    cg = ComputationGraph(cost)
    W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
    cost = cost + 1e-8 * (W1 ** 2).sum() + 1e-8 * (W2 ** 2).sum()
    cost.name = 'cost_with_regularization'
    print('W1', W1.get_value())
    print('W2', W2.get_value())

    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=RMSProp())

    data_stream_train = Flatten(DataStream.default_stream(
        train_set,
        iteration_scheme=ShuffledScheme(train_set.num_examples, batch_size=4)))
    data_stream_test = Flatten(DataStream.default_stream(
        test_set,
        iteration_scheme=SequentialScheme(test_set.num_examples, batch_size=1)))

    monitor = DataStreamMonitoring(variables=[cost, error],
                                   data_stream=data_stream_test,
                                   prefix="test")

    main_loop = MainLoop(data_stream=data_stream_train,
                         algorithm=algorithm,
                         extensions=[monitor,
                                     FinishAfter(after_n_epochs=100),
                                     Printing(),
                                     # ProgressBar()
                                     ])
    main_loop.run()
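`train()` expects Fuel datasets exposing 'features' and 'targets' sources. A hypothetical invocation on a toy XOR problem via Fuel's IndexableDataset; the data, the sizes, and the reuse of one dataset for both splits are made up for illustration:

import numpy
from collections import OrderedDict
from fuel.datasets import IndexableDataset

# 2-D float features and int64 targets, shaped to match
# tensor.matrix('features') and tensor.lmatrix('targets') above.
features = numpy.array([[0, 0], [0, 1], [1, 0], [1, 1]] * 25, dtype='float32')
targets = numpy.array([[0], [1], [1], [0]] * 25, dtype='int64')

xor_set = IndexableDataset(OrderedDict([('features', features),
                                        ('targets', targets)]))
train(train_set=xor_set, test_set=xor_set)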