def test_sgd(self):
    model_list = [dict(type='Linear', in_dim=128, out_dim=10)]
    criterion = dict(type='SoftmaxCrossEntropy')
    model = ConvNet(model_list, criterion)
    optimizer = SGD(model)

    # first forward/backward/update pass
    np.random.seed(1024)
    x = np.random.randn(32, 128)
    np.random.seed(1024)
    y = np.random.randint(10, size=32)
    tmp = model.forward(x, y)
    model.backward()
    optimizer.update(model)

    # second forward/backward/update pass
    np.random.seed(512)
    x = np.random.randn(32, 128)
    np.random.seed(512)
    y = np.random.randint(10, size=32)
    tmp = model.forward(x, y)
    model.backward()
    optimizer.update(model)

    # compare against reference weights saved after two SGD steps
    expected_weights = np.load('tests/sgd_weights/w.npy')
    expected_bias = np.load('tests/sgd_weights/b.npy')
    self.assertAlmostEqual(
        np.sum(np.abs(expected_weights - model.modules[0].weight)), 0)
    self.assertAlmostEqual(
        np.sum(np.abs(expected_bias - model.modules[0].bias)), 0)
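# The test above drives an optimizer built as SGD(model) and stepped with
# optimizer.update(model). A minimal sketch of that interface, assuming each
# module exposes weight/bias arrays with matching d_weight/d_bias gradients
# (these attribute names are assumptions, not the tested implementation):
class SGD:
    def __init__(self, model, lr=0.01):
        self.lr = lr

    def update(self, model):
        # vanilla SGD step: param <- param - lr * grad for every module
        for module in model.modules:
            if hasattr(module, 'weight'):
                module.weight -= self.lr * module.d_weight
                module.bias -= self.lr * module.d_bias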
def main():
    batch_size = 10
    wordvec_size = 100
    hidden_size = 100
    time_size = 5  # length of the truncated-BPTT window
    lr = 0.1
    max_epoch = 100

    # use only the first 1000 words of PTB to keep training fast
    corpus, word_to_id, id_to_word = ptb.load_data('train')
    corpus_size = 1000
    corpus = corpus[:corpus_size]
    vocab_size = int(max(corpus) + 1)

    xs = corpus[:-1]  # inputs
    ts = corpus[1:]   # targets: the next word
    data_size = len(xs)
    print(f'corpus size: {corpus_size}, vocabulary size: {vocab_size}')

    max_iters = data_size // (batch_size * time_size)
    time_idx = 0
    total_loss = 0
    loss_count = 0
    ppl_list = []

    model = SimpleRnnlm(vocab_size, wordvec_size, hidden_size)
    optimizer = SGD(lr)

    # each row of the batch reads the corpus from its own offset
    jump = (corpus_size - 1) // batch_size
    offsets = [i * jump for i in range(batch_size)]

    for epoch in range(1, max_epoch + 1):
        for iter_ in range(max_iters):
            batch_x = np.empty((batch_size, time_size), dtype=int)
            batch_t = np.empty((batch_size, time_size), dtype=int)
            for t in range(time_size):
                for i, offset in enumerate(offsets):
                    batch_x[i, t] = xs[(offset + time_idx) % data_size]
                    batch_t[i, t] = ts[(offset + time_idx) % data_size]
                time_idx += 1

            loss = model.forward(batch_x, batch_t)
            model.backward()
            optimizer.update(model.params, model.grads)
            total_loss += loss
            loss_count += 1

        # perplexity is the exponential of the average cross-entropy loss
        ppl = np.exp(total_loss / loss_count)
        print(f'| epoch {epoch} | perplexity {ppl}')
        ppl_list.append(float(ppl))
        total_loss, loss_count = 0, 0

    print('DONE')
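# main() above builds its optimizer as SGD(lr) and calls
# optimizer.update(model.params, model.grads); the shuffled mini-batch loop
# further below uses the same interface. A minimal sketch, assuming params
# and grads are parallel lists of numpy arrays:
class SGD:
    def __init__(self, lr=0.01):
        self.lr = lr

    def update(self, params, grads):
        # in-place vanilla SGD: params[i] <- params[i] - lr * grads[i]
        for i in range(len(params)):
            params[i] -= self.lr * grads[i]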
def test_update(self):
    sgd = SGD()
    gnn = GraphNeuralNetwork(vector_size=2)

    # With no gradients set, an update must leave the parameters unchanged.
    # Snapshot via repr so the comparison does not alias the live dict.
    expected = repr(gnn.params)
    sgd.update(gnn)
    actual = repr(gnn.params)
    self.assertEqual(expected, actual)

    # Repeated updates must follow the SGD rule: param <- param - lr * grad.
    params = copy.deepcopy(gnn.params)
    for _ in range(100):
        gnn.grads["W"] = np.random.rand()
        gnn.grads["A"] = np.random.rand()
        gnn.grads["b"] = np.random.rand()
        sgd.update(gnn)
        for key, param in params.items():
            params[key] = param - gnn.grads[key] * sgd.lr
            expected = repr(params[key])
            actual = repr(gnn.params[key])
            self.assertEqual(expected, actual)
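# test_update above expects sgd.update(gnn) to apply
# param <- param - lr * grad to each entry of gnn.params, reading the
# matching gradient from gnn.grads. A minimal sketch consistent with the
# assertions (the default lr and the empty-grads behaviour are assumptions):
class SGD:
    def __init__(self, lr=0.01):
        self.lr = lr

    def update(self, network):
        # skip keys with no gradient yet, so an update before any backward
        # pass leaves the parameters untouched
        for key in network.params:
            if key in network.grads:
                network.params[key] = network.params[key] - self.lr * network.grads[key]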
def train(weight_init_std, x_train, t_train, max_epochs):
    batch_norm_network = MultiLayerNet(input_size=784,
                                       hidden_size_list=[100, 100, 100, 100, 100],
                                       output_size=10,
                                       weight_init_std=weight_init_std,
                                       use_batchnorm=True)
    no_batch_norm_network = MultiLayerNet(input_size=784,
                                          hidden_size_list=[100, 100, 100, 100, 100],
                                          output_size=10,
                                          weight_init_std=weight_init_std)
    train_size = x_train.shape[0]
    batch_size = 100
    learning_rate = 0.01
    max_iters = 1000000000  # effectively unbounded; the epoch counter stops the loop
    iter_per_epoch = max(int(train_size / batch_size), 1)
    optimizer = SGD(lr=learning_rate)

    bn_train_acc_list = []
    no_bn_train_acc_list = []
    epoch_cnt = 0

    for i in range(max_iters):
        batch_mask = np.random.choice(train_size, batch_size)
        x_batch = x_train[batch_mask]
        t_batch = t_train[batch_mask]

        # train both networks on the same mini-batch for a fair comparison
        for network in (batch_norm_network, no_batch_norm_network):
            grads = network.gradient(x_batch, t_batch)
            optimizer.update(network.params, grads)

        if i % iter_per_epoch == 0:
            bn_train_acc = batch_norm_network.accuracy(x_train, t_train)
            no_bn_train_acc = no_batch_norm_network.accuracy(x_train, t_train)
            bn_train_acc_list.append(bn_train_acc)
            no_bn_train_acc_list.append(no_bn_train_acc)
            print("epoch:" + str(epoch_cnt) + " | "
                  + str(no_bn_train_acc) + " - " + str(bn_train_acc))
            epoch_cnt += 1
            if epoch_cnt >= max_epochs:
                break

    return no_bn_train_acc_list, bn_train_acc_list
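# train() above, and the MultiLayerNet loops that follow, call
# optimizer.update(network.params, grads) where both arguments are dicts
# keyed by layer name ('W1', 'b1', ...). A minimal sketch of that dict-based
# optimizer, assuming in-place numpy updates:
class SGD:
    def __init__(self, lr=0.01):
        self.lr = lr

    def update(self, params, grads):
        # in-place vanilla SGD over every named parameter
        for key in params.keys():
            params[key] -= self.lr * grads[key]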
def __train(weight_init_std):
    # x_train, t_train, train_size, batch_size, learning_rate and max_epochs
    # are module-level globals in the surrounding script.
    bn_network = MultiLayerNetExtend(
        input_size=784, hidden_size_list=[100, 100, 100, 100, 100],
        output_size=10, weight_init_std=weight_init_std, use_batchnorm=True)
    network = MultiLayerNetExtend(
        input_size=784, hidden_size_list=[100, 100, 100, 100, 100],
        output_size=10, weight_init_std=weight_init_std)
    optimizer = SGD(lr=learning_rate)

    train_acc_list = []
    bn_train_acc_list = []
    iter_per_epoch = max(int(train_size / batch_size), 1)
    epoch_cnt = 0

    for i in range(1000000000):  # effectively unbounded; stopped by epoch_cnt
        batch_mask = np.random.choice(train_size, batch_size)
        x_batch = x_train[batch_mask]
        t_batch = t_train[batch_mask]

        for _network in (bn_network, network):
            grads = _network.gradient(x_batch, t_batch)
            optimizer.update(_network.params, grads)

        if i % iter_per_epoch == 0:
            train_acc = network.accuracy(x_train, t_train)
            bn_train_acc = bn_network.accuracy(x_train, t_train)
            train_acc_list.append(train_acc)
            bn_train_acc_list.append(bn_train_acc)
            print("epoch:" + str(epoch_cnt) + " | "
                  + str(train_acc) + " - " + str(bn_train_acc))
            epoch_cnt += 1
            if epoch_cnt >= max_epochs:
                break

    return train_acc_list, bn_train_acc_list
# model, x, t, batch_size, max_epoch and learning_rate are defined earlier
# in the surrounding script.
optimizer = SGD(lr=learning_rate)
data_size = len(x)
max_iters = data_size // batch_size
total_loss = 0
loss_count = 0
loss_list = []

for epoch in range(max_epoch):
    # reshuffle the data at the start of every epoch
    idx = np.random.permutation(data_size)
    x = x[idx]
    t = t[idx]

    for iters in range(max_iters):
        batch_x = x[iters * batch_size:(iters + 1) * batch_size]
        batch_t = t[iters * batch_size:(iters + 1) * batch_size]

        loss = model.forward(batch_x, batch_t)
        model.backward()
        optimizer.update(model.params, model.grads)
        total_loss += loss
        loss_count += 1

        # report the running average loss every 10 iterations
        if (iters + 1) % 10 == 0:
            avg_loss = total_loss / loss_count
            print('| epoch %d | iter %d/%d | loss %.2f'
                  % (epoch + 1, iters + 1, max_iters, avg_loss))
            loss_list.append(avg_loss)
            total_loss, loss_count = 0, 0
# weight_init_types, networks, train_loss, optimizer, max_iterations,
# train_size, batch_size, x_train and t_train are set up earlier in the
# surrounding script.
for key, weight_type in weight_init_types.items():
    networks[key] = MultiLayerNet(input_size=784,
                                  hidden_size_list=[100, 100, 100, 100],
                                  output_size=10,
                                  weight_init_std=weight_type)
    train_loss[key] = []

# 2: Start training ==========
for i in range(max_iterations):
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]

    for key in weight_init_types.keys():
        grads = networks[key].gradient(x_batch, t_batch)
        optimizer.update(networks[key].params, grads)

        loss = networks[key].loss(x_batch, t_batch)
        train_loss[key].append(loss)

    if i % 100 == 0:
        print("===========" + "iteration:" + str(i) + "===========")
        for key in weight_init_types.keys():
            loss = networks[key].loss(x_batch, t_batch)
            print(key + ":" + str(loss))

# 3: Plot the results ==========
markers = {'std=0.01': 'o', 'Xavier': 's', 'He': 'D'}
x = np.arange(max_iterations)
for key in weight_init_types.keys():
    plt.plot(x, train_loss[key], marker=markers[key], markevery=100, label=key)
train_loss_list = []
train_acc_list = []
test_acc_list = []

iters_num = 10000
batch_size = 100
train_size = x_train.shape[0]
iter_per_epoch = max(train_size / batch_size, 1)

net = ConvNet()
# this SGD variant is constructed with the parameters and returns the
# updated dict from update()
optim = SGD(net.params, lr=0.1, momentum=0.9)
# optim = AdaGrad(net.params)
# optim = Adam(net.params)

for i in range(iters_num):
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]

    grad = net.gradient(x_batch, t_batch)
    net.params = optim.update(net.params, grad)

    loss = net.loss(x_batch, t_batch)
    train_loss_list.append(loss)

    if i % iter_per_epoch == 0:
        train_acc = net.accuracy(x_train, t_train)
        test_acc = net.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print("train acc, test acc | " + str(train_acc) + ", " + str(test_acc))
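# Unlike the in-place optimizers above, the optimizer here takes the
# parameters at construction time (SGD(net.params, lr=0.1, momentum=0.9))
# and update() returns the new parameter dict, which the loop assigns back
# to net.params. A minimal momentum-SGD sketch of that interface; the
# velocity buffer and its initialisation are assumptions:
class SGD:
    def __init__(self, params, lr=0.01, momentum=0.9):
        self.lr = lr
        self.momentum = momentum
        # one velocity buffer per parameter, shaped like the parameter
        self.v = {key: np.zeros_like(val) for key, val in params.items()}

    def update(self, params, grads):
        for key in params.keys():
            # v <- momentum * v - lr * grad;  param <- param + v
            self.v[key] = self.momentum * self.v[key] - self.lr * grads[key]
            params[key] = params[key] + self.v[key]
        return params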
# network, optimizer, train_size, max_epochs and the MNIST arrays
# (x_train, t_train, x_test, t_test) are defined earlier in the script.
batch_size = 100
train_loss_list = []
train_acc_list = []
test_acc_list = []

iter_per_epoch = max(train_size / batch_size, 1)
epoch_cnt = 0

for i in range(1000000000):  # effectively unbounded; stopped by epoch_cnt
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]

    grads = network.gradient(x_batch, t_batch)
    optimizer.update(network.params, grads)

    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print("epoch:" + str(epoch_cnt) + ", train acc:" + str(train_acc)
              + ", test acc:" + str(test_acc))
        epoch_cnt += 1
        if epoch_cnt >= max_epochs:
            break

# Plot the results ==========
def fit(self):
    n_batches = math.ceil(len(self.x_train) / self.batch_size)
    gpu_config = tf.compat.v1.ConfigProto(
        gpu_options=tf.compat.v1.GPUOptions(
            per_process_gpu_memory_fraction=self.per_process_gpu_memory_fraction),
        device_count={'GPU': 1})

    with tf.compat.v1.Session(config=gpu_config) as self.sess:
        optimizer = SGD(self.cost, self.params, self.learning_rate)
        all_grad = optimizer.get_grad()
        hessian = optimizer.get_hessian(layer_num=-2)
        updates = optimizer.update()
        train = tf.group(*updates)  # one op that applies every update
        self.sess.run(tf.compat.v1.global_variables_initializer())

        costs_train, accs_train, norms_train, hesse_matrixes_train = [], [], [], []
        costs_valid, accs_valid, norms_valid, hesse_matrixes_valid = [], [], [], []

        for self.epoch in range(self.epoch_size):
            self.x_train, self.t_train = shuffle(self.x_train, self.t_train)
            train_feed_dict = {self.x: self.x_train[:20000],
                               self.t: self.t_train[:20000],
                               self.is_training: False}
            valid_feed_dict = {self.x: self.x_valid,
                               self.t: self.t_valid,
                               self.is_training: False}

            for i in tqdm(range(n_batches)):
                start = i * self.batch_size
                end = start + self.batch_size
                self.sess.run(train,
                              feed_dict={self.x: self.x_train[start:end],
                                         self.t: self.t_train[start:end],
                                         self.is_training: True})

            norm_train = self.extract_gradient_norm(all_grad, 'Train', train_feed_dict)
            norms_train.append(norm_train)
            norm_valid = self.extract_gradient_norm(all_grad, 'Valid', valid_feed_dict)
            norms_valid.append(norm_valid)

            cost_train, acc_train = self.extract_cost_and_acc('Train', train_feed_dict)
            costs_train.append(cost_train)
            accs_train.append(acc_train)
            cost_valid, acc_valid = self.extract_cost_and_acc('Valid', valid_feed_dict)
            costs_valid.append(cost_valid)
            accs_valid.append(acc_valid)

            hesse_matrix_train = self.sess.run(hessian, feed_dict=train_feed_dict)
            hesse_matrixes_train.append(hesse_matrix_train)
            hesse_matrix_valid = self.sess.run(hessian, feed_dict=valid_feed_dict)
            hesse_matrixes_valid.append(hesse_matrix_valid)

        # loss surfaces are computed once, from the final epoch's Hessians
        surface_values_train = self.calculate_loss_surface(
            optimizer, hesse_matrix_train, train_feed_dict)
        surface_values_valid = self.calculate_loss_surface(
            optimizer, hesse_matrix_valid, valid_feed_dict)

        save_lists = [norms_train, hesse_matrixes_train, costs_train, accs_train,
                      surface_values_train, norms_valid, hesse_matrixes_valid,
                      costs_valid, accs_valid, surface_values_valid]
        result_names = ['grad_norm_train', 'hessian_train', 'costs_train',
                        'accuracy_train', 'surface_values_train',
                        'grad_norm_valid', 'hessian_valid', 'costs_valid',
                        'accuracy_valid', 'surface_values_valid']
        for save_list, result_name in zip(save_lists, result_names):
            # save_result's keyword is spelled 'reslut_name' in its signature
            self.save_result(save_list=save_list, reslut_name=result_name)
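# fit() above expects optimizer.update() to return a list of TF1 assign ops,
# which tf.group() fuses into a single training op. A minimal sketch of an
# optimizer with that interface, using plain gradient descent (class and
# attribute names here are assumptions, not the class used above):
class SGDSketch:
    def __init__(self, cost, params, learning_rate):
        self.cost = cost
        self.params = params
        self.learning_rate = learning_rate

    def get_grad(self):
        # symbolic gradients of the cost w.r.t. every parameter
        return tf.compat.v1.gradients(self.cost, self.params)

    def update(self):
        updates = []
        for param, grad in zip(self.params, self.get_grad()):
            # each op assigns param - lr * grad back into the variable
            updates.append(tf.compat.v1.assign(param, param - self.learning_rate * grad))
        return updates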
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True,
                                                  one_hot_label=True)

iters_num = 2000
train_size = x_train.shape[0]
batch_size = 100
train_loss = {}

# train one network per weight-initialization scheme, sequentially
for key, weight_type in weight_init_types.items():
    network = MultiLayerNet(input_size=784,
                            hidden_size_list=[100, 100, 100, 100],
                            output_size=10,
                            weight_init_std=weight_type)
    optimizer = SGD()
    train_loss[key] = []

    for i in range(iters_num):
        mask = np.random.choice(train_size, batch_size)
        x_batch = x_train[mask]
        t_batch = t_train[mask]

        grads = network.gradient(x_batch, t_batch)
        optimizer.update(network.params, grads)
        train_loss[key].append(network.loss(x_batch, t_batch))

markers = {'std=0.01': 'o', 'Xavier': 's', 'He': 'D'}
x = np.arange(iters_num)
for key in weight_init_types.keys():
    plt.plot(x, train_loss[key], marker=markers[key], markevery=100, label=key)
plt.xlabel("iterations")
plt.ylabel("loss")
plt.ylim(0, 2.5)
plt.legend()
plt.show()
class RNN:
    def __init__(self, input_size, hidden_size, labels):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = len(labels)
        self.labels = labels
        self.label_index = {labels[i]: i for i in range(len(labels))}
        self.variables = self._init_variables()
        self.optimizer = SGD(self.variables)

    def _init_variables(self):
        # 'v' (hidden-to-output weight) is an assumed addition: without it
        # the hidden vector and the output bias 'c' have incompatible shapes.
        variables = {
            'w': Variable(np.random.randn(self.input_size, self.hidden_size)),
            'w_rec': Variable(np.random.randn(self.hidden_size, self.hidden_size)),
            'b': Variable(np.random.randn(self.hidden_size)),
            'v': Variable(np.random.randn(self.hidden_size, self.output_size)),
            'c': Variable(np.random.randn(self.output_size)),
        }
        return variables

    def _calculate(self, input_list):
        # Elman recurrence: h_t = sigmoid(x_t w + h_{t-1} w_rec + b),
        # per-step output y_t = h_t v + c.
        hidden = Variable(np.zeros(self.hidden_size))
        output_list = []
        for x in input_list:
            hidden = Sigmoid(
                Add(Add(Mul(Variable(x), self.variables['w']),
                        Mul(hidden, self.variables['w_rec'])),
                    self.variables['b']))
            output = Add(Mul(hidden, self.variables['v']), self.variables['c'])
            output_list.append(output)
        return output_list

    def fit(self, input_list, label, update=True):
        output_list = self._calculate(input_list)
        t_node = self.t_node(label)
        # sum the per-step softmax losses into a single graph node
        graph = None
        for output in output_list:
            loss_node = SoftmaxLoss(output, t_node)
            graph = loss_node if graph is None else Add(graph, loss_node)
        loss = graph.forward()
        graph.backward(1)
        if update:
            self.optimizer.update()
        return loss

    def predict(self, input_list):
        output_list = self._calculate(input_list)
        results = []
        for output in output_list:
            y = output.forward()
            max_index = np.argmax(y)
            results.append(self.labels[max_index])
        return results

    def t_node(self, label):
        index = self.label_index[label]
        t = np.zeros(self.output_size)
        t[index] = 1.0
        return Variable(t)
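# A hypothetical usage sketch for the RNN class above: sequences of
# 3-dimensional vectors classified into two labels. The data, sizes and
# label names are made up for illustration.
if __name__ == '__main__':
    rnn = RNN(input_size=3, hidden_size=8, labels=['pos', 'neg'])
    sequence = [np.random.randn(3) for _ in range(5)]
    for _ in range(10):
        loss = rnn.fit(sequence, 'pos')  # one SGD step on this sequence
    print(rnn.predict(sequence))  # per-step label predictions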