def test_selectedrows_gradient2(self):
    places = [fluid.CPUPlace()]
    if core.is_compiled_with_cuda():
        places.append(fluid.CUDAPlace(0))

    for place in places:
        for sort_sum_gradient in [True, False]:
            with fluid.dygraph.guard(place):
                backward_strategy = fluid.dygraph.BackwardStrategy()
                backward_strategy.sort_sum_gradient = sort_sum_gradient
                grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0)

                input_word = np.array([[1, 2], [2, 1]]).astype('int64')
                input = to_variable(input_word)

                simplenet = SimpleNet(20, 32, "float32")
                adam = SGDOptimizer(learning_rate=0.001,
                                    parameter_list=simplenet.parameters(),
                                    grad_clip=grad_clip)
                input_emb, emb = simplenet(input)

                self.assertTrue(emb.weight.gradient() is None)
                self.assertTrue(input_emb.gradient() is None)

                input_emb.backward(backward_strategy)
                adam.minimize(input_emb)
                self.assertTrue(emb.weight.gradient() is not None)

                emb.clear_gradients()
                self.assertTrue(emb.weight.gradient() is None)

                input_emb.clear_gradient()
                self.assertTrue(input_emb.gradient() is not None)
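# Why the last assertion expects a non-None gradient: Layer.clear_gradients()
# drops parameter grads entirely, while Variable.clear_gradient() zero-fills
# the grad buffer in place. A minimal sketch of that contrast, under the
# assumption of a dygraph-enabled fluid build (not part of the test above):
import numpy as np
import paddle.fluid as fluid

with fluid.dygraph.guard():
    x = fluid.dygraph.to_variable(np.ones([2, 2], dtype='float32'))
    x.stop_gradient = False
    y = fluid.layers.reduce_sum(fluid.layers.elementwise_mul(x, x))
    y.backward()

    assert x.gradient() is not None
    x.clear_gradient()                 # zero-fills the buffer in place...
    assert (x.gradient() == 0).all()   # ...so gradient() still returns an array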
def test_save_load_persistables(self):
    seed = 90
    epoch_num = 1
    batch_size = 128

    with fluid.dygraph.guard():
        fluid.default_startup_program().random_seed = seed
        fluid.default_main_program().random_seed = seed

        mnist = MNIST("mnist")
        sgd = SGDOptimizer(learning_rate=1e-3)

        batch_py_reader = fluid.io.PyReader(capacity=1)
        batch_py_reader.decorate_sample_list_generator(
            paddle.batch(self.reader_decorator(paddle.dataset.mnist.train()),
                         batch_size=batch_size,
                         drop_last=True),
            places=fluid.CPUPlace())

        dy_param_init_value = {}
        for epoch in range(epoch_num):
            for batch_id, data in enumerate(batch_py_reader()):
                img = data[0]
                label = data[1]
                label.stop_gradient = True

                cost = mnist(img)
                loss = fluid.layers.cross_entropy(cost, label)
                avg_loss = fluid.layers.mean(loss)
                dy_out = avg_loss.numpy()

                avg_loss.backward()
                sgd.minimize(avg_loss)
                fluid.dygraph.save_persistables(mnist.state_dict(),
                                                "save_dir")
                mnist.clear_gradients()

                for param in mnist.parameters():
                    dy_param_init_value[param.name] = param.numpy()

                restore, _ = fluid.dygraph.load_persistables("save_dir")
                self.assertRaises(IOError, fluid.dygraph.load_persistables,
                                  "not_exist_dir")
                mnist.load_dict(restore)

                self.assertEqual(len(dy_param_init_value), len(restore))
                for ky, value in restore.items():
                    self.assertTrue(
                        np.allclose(value.numpy(),
                                    dy_param_init_value[value.name]))
                    # Reduce after the elementwise check, not before:
                    # np.isfinite(arr.all()) would only test a scalar bool.
                    self.assertTrue(np.isfinite(value.numpy()).all())
                    self.assertFalse(np.isnan(value.numpy()).any())

                if batch_id > 10:
                    break
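# The assertion fix above hinges on reduction order; a tiny, self-contained
# numpy check (not part of the test suite) shows why arr.all() must come last:
import numpy as np

arr = np.array([1.0, np.nan])
print(np.isfinite(arr).all())   # False -- correct elementwise check
print(np.isfinite(arr.all()))   # True  -- arr.all() collapses to one bool first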
def test_selectedrows_gradient1(self):
    places = [fluid.CPUPlace()]
    if core.is_compiled_with_cuda():
        places.append(fluid.CUDAPlace(0))

    for place in places:
        for dtype in ["float32", "float64"]:
            for sort_sum_gradient in [True, False]:
                paddle.disable_static(place)
                fluid.set_flags(
                    {'FLAGS_sort_sum_gradient': sort_sum_gradient})
                # grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0)

                input_word = np.array([[1, 2], [2, 1]]).astype('int64')
                input = paddle.to_tensor(input_word)

                simplenet = SimpleNet(20, 32, dtype)
                adam = SGDOptimizer(
                    learning_rate=0.001,
                    parameter_list=simplenet.parameters()
                )  # grad_clip=grad_clip
                input_emb, emb = simplenet(input)

                self.assertTrue(emb.weight.gradient() is None)
                self.assertTrue(input_emb.gradient() is None)

                input_emb.backward()
                adam.minimize(input_emb)
                self.assertTrue(emb.weight.gradient() is not None)

                emb.clear_gradients()
                self.assertTrue(emb.weight.gradient() is None)

                input_emb.clear_gradient()
                self.assertTrue(input_emb.gradient() is not None)

                paddle.enable_static()
def _check_mlp(self):
    seed = 90
    with fluid.dygraph.guard():
        fluid.default_startup_program().random_seed = seed
        fluid.default_main_program().random_seed = seed

        mlp = MLP('mlp')
        optimizer = self.get_optimizer()
        optimizer2 = SGDOptimizer(
            learning_rate=fluid.layers.natural_exp_decay(learning_rate=0.1,
                                                         decay_steps=10000,
                                                         decay_rate=0.5,
                                                         staircase=True))

        train_reader = paddle.batch(paddle.dataset.mnist.train(),
                                    batch_size=128,
                                    drop_last=True)

        for batch_id, data in enumerate(train_reader()):
            dy_x_data = np.array([x[0].reshape(1, 28, 28)
                                  for x in data]).astype('float32')
            y_data = np.array([x[1] for x in data
                               ]).astype('int64').reshape(128, 1)

            img = to_variable(dy_x_data)
            label = to_variable(y_data)
            label._stop_gradient = True

            cost = mlp(img)
            avg_loss = fluid.layers.reduce_mean(cost)

            avg_loss.backward()
            optimizer.minimize(avg_loss)
            optimizer2.minimize(avg_loss)
            mlp.clear_gradients()
            fluid.dygraph.save_persistables(mlp.state_dict(), "save_dir_2",
                                            [optimizer, optimizer2])
            if batch_id == 2:
                break

    with fluid.dygraph.guard():
        fluid.default_startup_program().random_seed = seed
        fluid.default_main_program().random_seed = seed

        mlp_load = MLP('mlp')
        optimizer_load1 = self.get_optimizer()
        optimizer_load2 = SGDOptimizer(
            learning_rate=fluid.layers.natural_exp_decay(learning_rate=0.1,
                                                         decay_steps=10000,
                                                         decay_rate=0.5,
                                                         staircase=True))

        parameters, optimizers = fluid.dygraph.load_persistables(
            "save_dir_2")
        mlp_load.load_dict(parameters)
        optimizer_load1.load(optimizers)
        optimizer_load2.load(optimizers)

    self.assertTrue(optimizer._learning_rate.__dict__ ==
                    optimizer_load1._learning_rate.__dict__)
    self.assertTrue(optimizer2._learning_rate.__dict__ ==
                    optimizer_load2._learning_rate.__dict__)
def get_optimizer(self):
    optimizer = SGDOptimizer(learning_rate=fluid.layers.exponential_decay(
        learning_rate=0.1,
        decay_steps=10000,
        decay_rate=0.5,
        staircase=True))
    return optimizer
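# For reference, the documented exponential_decay rule restated as plain
# Python (a sketch of the schedule's arithmetic, not Paddle's implementation):
def exponential_decay(step, learning_rate=0.1, decay_steps=10000,
                      decay_rate=0.5, staircase=True):
    ratio = step / decay_steps
    if staircase:
        ratio = step // decay_steps  # decay in discrete jumps
    return learning_rate * decay_rate ** ratio

print(exponential_decay(9999))   # 0.1  (still in the first interval)
print(exponential_decay(10000))  # 0.05 (first staircase drop)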
def get_optimizer_dygraph(self, parameter_list):
    bd = [3, 6, 9]
    optimizer = SGDOptimizer(
        learning_rate=fluid.layers.piecewise_decay(
            boundaries=bd,
            values=[0.1 * (0.1**i) for i in range(len(bd) + 1)]),
        parameter_list=parameter_list)
    return optimizer
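# What the piecewise schedule above evaluates to, written out as plain Python
# (boundaries are global-step thresholds; len(values) == len(boundaries) + 1).
# A sketch of the documented semantics, not Paddle's implementation:
def piecewise_decay(step, boundaries=(3, 6, 9),
                    values=(0.1, 0.01, 0.001, 0.0001)):
    for boundary, value in zip(boundaries, values):
        if step < boundary:
            return value
    return values[-1]

assert piecewise_decay(0) == 0.1
assert piecewise_decay(3) == 0.01
assert piecewise_decay(100) == 0.0001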
def test_selectedrows_gradient1(self):
    places = [fluid.CPUPlace()]
    if core.is_compiled_with_cuda():
        places.append(fluid.CUDAPlace(0))

    # Message raised by the framework for variables that have no gradient yet;
    # factored out so the three asserts below stay in sync.
    no_grad_msg = (
        "has no grad, Please set Variable.stop_gradient=False, or check if "
        "this is the first and only variable need grad, if so, please set "
        "its pre-Variable's stop_gradient=False, to make sure it has gradient")

    for place in places:
        for dtype in ["float32", "float64"]:
            for sort_sum_gradient in [True, False]:
                with fluid.dygraph.guard(place):
                    backward_strategy = fluid.dygraph.BackwardStrategy()
                    backward_strategy.sort_sum_gradient = sort_sum_gradient
                    # grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(5.0)

                    input_word = np.array([[1, 2], [2, 1]]).astype('int64')
                    input = to_variable(input_word)

                    simplenet = SimpleNet(20, 32, dtype)
                    adam = SGDOptimizer(
                        learning_rate=0.001,
                        parameter_list=simplenet.parameters())
                    input_emb, emb = simplenet(input)

                    try:
                        emb.weight.gradient()
                    except ValueError as e:
                        assert no_grad_msg in str(e)
                    try:
                        input_emb.gradient()
                    except ValueError as e:
                        assert no_grad_msg in str(e)

                    input_emb.backward(backward_strategy)
                    adam.minimize(input_emb)  # grad_clip=grad_clip
                    emb.weight.gradient()

                    emb.clear_gradients()
                    try:
                        emb.weight.gradient()
                    except ValueError as e:
                        assert no_grad_msg in str(e)

                    input_emb.clear_gradient()
                    input_emb.gradient()
def get_optimizer_dygraph(self, parameter_list):
    optimizer = SGDOptimizer(
        learning_rate=fluid.layers.exponential_decay(learning_rate=0.1,
                                                     decay_steps=10000,
                                                     decay_rate=0.5,
                                                     staircase=True),
        parameter_list=parameter_list)
    return optimizer
def run_dygraph():
    paddle.seed(seed)
    paddle.framework.random._manual_program_seed(seed)

    policy = Policy(input_size=4)

    dy_state = fluid.dygraph.base.to_variable(state)
    dy_state.stop_gradient = True
    loss_probs = policy(dy_state)

    dy_mask = fluid.dygraph.base.to_variable(mask)
    dy_mask.stop_gradient = True

    loss_probs = fluid.layers.log(loss_probs)
    loss_probs = fluid.layers.elementwise_mul(loss_probs, dy_mask)
    loss_probs = fluid.layers.reduce_sum(loss_probs, dim=-1)

    dy_reward = fluid.dygraph.base.to_variable(reward)
    dy_reward.stop_gradient = True

    loss_probs = fluid.layers.elementwise_mul(dy_reward, loss_probs)
    loss = fluid.layers.reduce_sum(loss_probs)

    sgd = SGDOptimizer(learning_rate=1e-3,
                       parameter_list=policy.parameters())

    dy_param_init_value = {}
    dy_out = loss.numpy()

    for param in policy.parameters():
        dy_param_init_value[param.name] = param.numpy()

    loss.backward()
    sgd.minimize(loss)
    policy.clear_gradients()

    dy_param_value = {}
    for param in policy.parameters():
        dy_param_value[param.name] = param.numpy()

    return dy_out, dy_param_init_value, dy_param_value
def func_test_parameter_list(self):
    with fluid.dygraph.guard():
        linear_1 = Linear(10, 10)
        linear_2 = Linear(10, 10)

        sgd = SGDOptimizer(1.0,
                           parameter_list=itertools.chain(
                               linear_1.parameters(),
                               linear_2.parameters()))

        in_np = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
        in_data = fluid.dygraph.to_variable(in_np)

        y = linear_1(in_data)
        y = linear_2(y)
        loss = fluid.layers.reduce_mean(y)

        loss.backward()
        sgd.minimize(loss)

        self.assertTrue(
            len(sgd._parameter_list) == len(linear_1.parameters() +
                                            linear_2.parameters()))
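# The length assertion above apparently relies on the optimizer materializing
# the itertools.chain into a list, since a chain can only be traversed once.
# A minimal, self-contained illustration of that pitfall:
import itertools

params = itertools.chain([1, 2], [3, 4])
as_list = list(params)       # materialize once, up front
assert len(as_list) == 4
assert list(params) == []    # the chain object itself is now exhausted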
def get_optimizer_dygraph(self, parameter_list):
    optimizer = SGDOptimizer(
        learning_rate=fluid.layers.noam_decay(d_model=512,
                                              warmup_steps=8000),
        parameter_list=parameter_list)
    return optimizer
def get_optimizer(self):
    optimizer = SGDOptimizer(learning_rate=fluid.layers.noam_decay(
        d_model=512, warmup_steps=8000))
    return optimizer
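# noam_decay follows the schedule from "Attention Is All You Need": warm up
# roughly linearly, then decay with the inverse square root of the step. A
# plain-Python restatement of that formula (step counts from 1), as a sketch:
def noam_decay(step, d_model=512, warmup_steps=8000):
    return d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)

# Both branches of the min() meet at step == warmup_steps, the schedule's peak:
print(noam_decay(8000))   # == (512 * 8000) ** -0.5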
def get_optimizer_dygraph(self, parameter_list):
    optimizer = SGDOptimizer(
        learning_rate=fluid.layers.cosine_decay(learning_rate=0.1,
                                                step_each_epoch=10000,
                                                epochs=120),
        parameter_list=parameter_list)
    return optimizer
def get_optimizer(self):
    optimizer = SGDOptimizer(learning_rate=fluid.layers.cosine_decay(
        learning_rate=0.1, step_each_epoch=10000, epochs=120))
    return optimizer
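# The documented cosine_decay rule in plain Python: anneal from the base rate
# toward zero over `epochs`, with the epoch index derived from the global
# step. A sketch of the arithmetic, not Paddle's implementation:
import math

def cosine_decay(step, learning_rate=0.1, step_each_epoch=10000, epochs=120):
    epoch = step // step_each_epoch
    return learning_rate * 0.5 * (math.cos(epoch * math.pi / epochs) + 1)

print(cosine_decay(0))            # 0.1 at the start
print(cosine_decay(119 * 10000))  # close to 0 in the final epoch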
def train_ptb_lm():
    args = parse_args()

    # check if use_gpu=True was set on a CPU-only paddlepaddle build
    model_check.check_cuda(args.use_gpu)
    place = core.CPUPlace()
    if args.use_gpu:
        place = core.CUDAPlace(0)

    # check if the paddlepaddle version is satisfied
    model_check.check_version()

    model_type = args.model_type
    vocab_size = 10000
    if model_type == "test":
        num_layers = 1
        batch_size = 2
        hidden_size = 10
        num_steps = 3
        init_scale = 0.1
        max_grad_norm = 5.0
        epoch_start_decay = 1
        max_epoch = 1
        dropout = 0.0
        lr_decay = 0.5
        base_learning_rate = 1.0
    elif model_type == "small":
        num_layers = 2
        batch_size = 20
        hidden_size = 200
        num_steps = 20
        init_scale = 0.1
        max_grad_norm = 5.0
        epoch_start_decay = 4
        max_epoch = 13
        dropout = 0.0
        lr_decay = 0.5
        base_learning_rate = 1.0
    elif model_type == "medium":
        num_layers = 2
        batch_size = 20
        hidden_size = 650
        num_steps = 35
        init_scale = 0.05
        max_grad_norm = 5.0
        epoch_start_decay = 6
        max_epoch = 39
        dropout = 0.5
        lr_decay = 0.8
        base_learning_rate = 1.0
    elif model_type == "large":
        num_layers = 2
        batch_size = 20
        hidden_size = 1500
        num_steps = 35
        init_scale = 0.04
        max_grad_norm = 10.0
        epoch_start_decay = 14
        max_epoch = 55
        dropout = 0.65
        lr_decay = 1.0 / 1.15
        base_learning_rate = 1.0
    else:
        print("model type not supported")
        return

    with fluid.dygraph.guard(place):
        if args.ce:
            print("ce mode")
            seed = 33
            np.random.seed(seed)
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed
            max_epoch = 1

        ptb_model = PtbModel(hidden_size=hidden_size,
                             vocab_size=vocab_size,
                             num_layers=num_layers,
                             num_steps=num_steps,
                             init_scale=init_scale,
                             dropout=dropout)

        if args.init_from_pretrain_model:
            if not os.path.exists(args.init_from_pretrain_model +
                                  '.pdparams'):
                print(args.init_from_pretrain_model)
                raise Warning("The pretrained params do not exist.")
            # Apply the loaded parameters; the original discarded the return
            # value of load_dygraph, so the weights were never installed.
            para_dict, _ = fluid.load_dygraph(args.init_from_pretrain_model)
            ptb_model.set_dict(para_dict)
            print("finish initializing model from pretrained params from %s" %
                  (args.init_from_pretrain_model))

        dy_param_updated = dict()
        dy_param_init = dict()
        dy_loss = None
        last_hidden = None
        last_cell = None

        data_path = args.data_path
        print("begin to load data")
        ptb_data = reader.get_ptb_data(data_path)
        print("finished load data")
        train_data, valid_data, test_data = ptb_data

        batch_len = len(train_data) // batch_size
        total_batch_size = (batch_len - 1) // num_steps
        log_interval = 200

        bd = []
        lr_arr = [1.0]
        for i in range(1, max_epoch):
            bd.append(total_batch_size * i)
            new_lr = base_learning_rate * (lr_decay**max(
                i + 1 - epoch_start_decay, 0.0))
            lr_arr.append(new_lr)

        sgd = SGDOptimizer(learning_rate=fluid.layers.piecewise_decay(
            boundaries=bd, values=lr_arr),
                           parameter_list=ptb_model.parameters())

        def eval(model, data):
            print("begin to eval")
            total_loss = 0.0
            iters = 0.0
            init_hidden_data = np.zeros((num_layers, batch_size, hidden_size),
                                        dtype='float32')
            init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
                                      dtype='float32')

            model.eval()
            train_data_iter = reader.get_data_iter(data, batch_size,
                                                   num_steps)
            for batch_id, batch in enumerate(train_data_iter):
                x_data, y_data = batch
                x_data = x_data.reshape((-1, num_steps, 1))
                y_data = y_data.reshape((-1, num_steps, 1))
                x = to_variable(x_data)
                y = to_variable(y_data)
                init_hidden = to_variable(init_hidden_data)
                init_cell = to_variable(init_cell_data)
                dy_loss, last_hidden, last_cell = ptb_model(
                    x, y, init_hidden, init_cell)

                out_loss = dy_loss.numpy()

                init_hidden_data = last_hidden.numpy()
                init_cell_data = last_cell.numpy()

                total_loss += out_loss
                iters += num_steps

            print("eval finished")
            ppl = np.exp(total_loss / iters)
            print("ppl ", batch_id, ppl[0])
            if args.ce:
                print("kpis\ttest_ppl\t%0.3f" % ppl[0])

        grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(
            max_grad_norm)
        for epoch_id in range(1):
            ptb_model.train()
            total_loss = 0.0
            iters = 0.0
            init_hidden_data = np.zeros((num_layers, batch_size, hidden_size),
                                        dtype='float32')
            init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
                                      dtype='float32')

            train_data_iter = reader.get_data_iter(train_data, batch_size,
                                                   num_steps)
            init_hidden = to_variable(init_hidden_data)
            init_cell = to_variable(init_cell_data)
            start_time = time.time()
            start = time.time()
            for batch_id, batch in enumerate(train_data_iter):
                x_data, y_data = batch
                x_data = x_data.reshape((-1, num_steps, 1))
                y_data = y_data.reshape((-1, num_steps, 1))

                x = to_variable(x_data)
                y = to_variable(y_data)

                dy_loss, last_hidden, last_cell = ptb_model(
                    x, y, init_hidden, init_cell)
                init_hidden = last_hidden
                init_cell = last_cell
                init_hidden.stop_gradient = True
                init_cell.stop_gradient = True
                out_loss = dy_loss.numpy()

                dy_loss.backward()
                sgd.minimize(dy_loss, grad_clip=grad_clip)

                ptb_model.clear_gradients()
                total_loss += out_loss
                iters += num_steps

                if batch_id > 0 and batch_id % log_interval == 0:
                    ppl = np.exp(total_loss / iters)
                    print(
                        "-- Epoch:[%d]; Batch:[%d]; ppl: %.5f, lr: %.5f, loss: %.5f"
                        % (epoch_id, batch_id, ppl[0],
                           sgd._global_learning_rate().numpy(), out_loss))

            end = time.time()
            print("One epoch cost {}".format(end - start))
            print("one epoch finished", epoch_id)
            print("time cost ", time.time() - start_time)
            ppl = np.exp(total_loss / iters)
            print("-- Epoch:[%d]; ppl: %.5f" % (epoch_id, ppl[0]))

            if batch_size <= 20 and epoch_id == 0 and ppl[0] > 1000:
                # with a bad init, the loss stays above 1000 after the first
                # epoch; there is no need to continue
                print("Parameters are randomly initialized and not good "
                      "this time because the loss is over 1000 after the "
                      "first epoch.")
                print("Abort this training process and please start again.")
                return

            if args.ce:
                print("kpis\ttrain_ppl\t%0.3f" % ppl[0])

            save_model_dir = os.path.join(args.save_model_dir, str(epoch_id),
                                          'params')
            fluid.save_dygraph(ptb_model.state_dict(), save_model_dir)
            print("Saved model to: %s.\n" % save_model_dir)

        eval(ptb_model, valid_data)
        eval(ptb_model, test_data)
def get_optimizer(self):
    optimizer = SGDOptimizer(learning_rate=fluid.layers.polynomial_decay(
        learning_rate=0.1, decay_steps=5, cycle=self.cycle))
    return optimizer
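# The documented polynomial_decay semantics restated as plain Python. With
# cycle=True the decay horizon stretches to the next multiple of decay_steps
# instead of clamping the step. A sketch, not Paddle's implementation:
import math

def polynomial_decay(step, learning_rate=0.1, decay_steps=5,
                     end_learning_rate=0.0001, power=1.0, cycle=False):
    if cycle:
        decay_steps = decay_steps * max(1, math.ceil(step / decay_steps))
    else:
        step = min(step, decay_steps)
    return ((learning_rate - end_learning_rate) *
            (1 - step / decay_steps) ** power + end_learning_rate)

print(polynomial_decay(7, cycle=False))  # clamped at the floor, 0.0001
print(polynomial_decay(7, cycle=True))   # decays toward step 10 instead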
def get_optimizer(self):
    bd = [3, 6, 9]
    optimizer = SGDOptimizer(learning_rate=fluid.layers.piecewise_decay(
        boundaries=bd, values=[0.1 * (0.1**i) for i in range(len(bd) + 1)]))
    return optimizer
'''
Part 3. Train & Save
'''
# enable dygraph mode
place = fluid.CUDAPlace(0) if USE_CUDA else fluid.CPUPlace()
fluid.enable_dygraph(place)
fluid.default_main_program().random_seed = 10
fluid.default_startup_program().random_seed = 10

# create network
mnist = MNIST()
# NOTE: jit.save doesn't save optimizer parameters,
# so if use other optimizer like Adam here,
# the later train loss will not be equal
sgd = SGDOptimizer(learning_rate=0.001, parameter_list=mnist.parameters())

# create train data loader
train_reader = paddle.batch(reader_decorator(paddle.dataset.mnist.train()),
                            batch_size=BATCH_SIZE,
                            drop_last=True)
train_loader = fluid.io.DataLoader.from_generator(capacity=5)
train_loader.set_sample_list_generator(train_reader, places=place)

# train
for epoch in range(EPOCH_NUM):
    train_one_epoch(mnist, sgd, train_loader)

# save
fluid.dygraph.jit.save(layer=mnist, model_path=MODEL_PATH)

'''
Part 4. Load & Inference
'''
# load model
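# The load step is cut off above. A hypothetical continuation, assuming the
# usual jit.load workflow for this API generation; `infer_np` and the shapes
# are invented placeholders, not taken from the original script:
import numpy as np

loaded_mnist = fluid.dygraph.jit.load(MODEL_PATH)
loaded_mnist.eval()

infer_np = np.random.random((1, 1, 28, 28)).astype('float32')
pred = loaded_mnist(fluid.dygraph.to_variable(infer_np))
print(pred.numpy().shape)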
def train(use_cuda, save_dirname, is_local=True):
    scale_infer, avg_cost = model()

    # test program
    test_program = fluid.default_main_program().clone(for_test=True)

    sgd_optimizer = SGDOptimizer(learning_rate=0.2)
    optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = Executor(place)

    train_reader = paddle.batch(
        paddle.reader.shuffle(paddle.dataset.movielens.train(),
                              buf_size=8192),
        batch_size=BATCH_SIZE)
    test_reader = paddle.batch(paddle.dataset.movielens.test(),
                               batch_size=BATCH_SIZE)

    feeding = {
        'user_id': 0,
        'gender_id': 1,
        'age_id': 2,
        'job_id': 3,
        'movie_id': 4,
        'category_id': 5,
        'movie_title': 6,
        'score': 7
    }

    def func_feed(feeding, data):
        feed_tensors = {}
        # dict.iteritems() and a lazy map() were Python 2 idioms; use their
        # Python 3 equivalents so the sequences can be indexed and reused.
        for (key, idx) in feeding.items():
            tensor = fluid.LoDTensor()
            if key != "category_id" and key != "movie_title":
                if key == "score":
                    numpy_data = np.array([x[idx] for x in data
                                           ]).astype("float32")
                else:
                    numpy_data = np.array([x[idx] for x in data
                                           ]).astype("int64")
            else:
                numpy_data = [
                    np.array(x[idx]).astype("int64") for x in data
                ]
                lod_info = [len(item) for item in numpy_data]
                offset = 0
                lod = [offset]
                for item in lod_info:
                    offset += item
                    lod.append(offset)
                numpy_data = np.concatenate(numpy_data, axis=0)
                tensor.set_lod([lod])

            numpy_data = numpy_data.reshape([numpy_data.shape[0], 1])
            tensor.set(numpy_data, place)
            feed_tensors[key] = tensor
        return feed_tensors

    def train_loop(main_program):
        exe.run(framework.default_startup_program())

        PASS_NUM = 100
        for pass_id in range(PASS_NUM):
            for batch_id, data in enumerate(train_reader()):
                # train a mini-batch
                outs = exe.run(program=main_program,
                               feed=func_feed(feeding, data),
                               fetch_list=[avg_cost])
                out = np.array(outs[0])
                if (batch_id + 1) % 10 == 0:
                    avg_cost_set = []
                    for test_data in test_reader():
                        avg_cost_np = exe.run(program=test_program,
                                              feed=func_feed(
                                                  feeding, test_data),
                                              fetch_list=[avg_cost])
                        avg_cost_set.append(avg_cost_np[0])
                        break  # test only 1 segment for speeding up CI

                    # get test avg_cost
                    test_avg_cost = np.array(avg_cost_set).mean()
                    if test_avg_cost < 6.0:
                        # if avg_cost less than 6.0, we think our code is good.
                        if save_dirname is not None:
                            fluid.io.save_inference_model(save_dirname, [
                                "user_id", "gender_id", "age_id", "job_id",
                                "movie_id", "category_id", "movie_title"
                            ], [scale_infer], exe)
                        return

                if math.isnan(float(out[0])):
                    sys.exit("got NaN loss, training failed.")

    if is_local:
        train_loop(fluid.default_main_program())
    else:
        port = os.getenv("PADDLE_INIT_PORT", "6174")
        pserver_ips = os.getenv("PADDLE_INIT_PSERVERS")  # ip,ip...
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
        trainers = int(os.getenv("TRAINERS"))
        current_endpoint = os.getenv("POD_IP") + ":" + port
        trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
        training_role = os.getenv("TRAINING_ROLE", "TRAINER")

        t = fluid.DistributeTranspiler()
        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
        if training_role == "PSERVER":
            pserver_prog = t.get_pserver_program(current_endpoint)
            pserver_startup = t.get_startup_program(current_endpoint,
                                                    pserver_prog)
            exe.run(pserver_startup)
            exe.run(pserver_prog)
        elif training_role == "TRAINER":
            train_loop(t.get_trainer_program())
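# How the LoD (level-of-detail) offsets in func_feed are built: the
# variable-length sequences are concatenated, and `lod` records cumulative
# boundaries. A tiny self-contained check with made-up lengths:
seq_lens = [3, 1, 2]        # e.g. three movie titles of 3, 1 and 2 tokens
lod = [0]
for n in seq_lens:
    lod.append(lod[-1] + n)
assert lod == [0, 3, 4, 6]  # sequence i spans rows lod[i]:lod[i + 1]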
def test_mnist_float32(self):
    seed = 90
    epoch_num = 1

    state = np.random.normal(size=4).astype("float32")
    state_list = state.tolist()
    reward = np.random.random(size=[1, 1]).astype("float32")
    reward_list = reward.tolist()
    action_list = [1]
    action = np.array(action_list).astype("float32")
    mask_list = [[0, 1]]
    mask = np.array(mask_list).astype("float32")

    with fluid.dygraph.guard():
        fluid.default_startup_program().random_seed = seed
        fluid.default_main_program().random_seed = seed

        policy = Policy(input_size=4)

        dy_state = fluid.dygraph.base.to_variable(state)
        dy_state.stop_gradient = True
        loss_probs = policy(dy_state)

        dy_mask = fluid.dygraph.base.to_variable(mask)
        dy_mask.stop_gradient = True

        loss_probs = fluid.layers.log(loss_probs)
        loss_probs = fluid.layers.elementwise_mul(loss_probs, dy_mask)
        loss_probs = fluid.layers.reduce_sum(loss_probs, dim=-1)

        dy_reward = fluid.dygraph.base.to_variable(reward)
        dy_reward.stop_gradient = True

        loss_probs = fluid.layers.elementwise_mul(dy_reward, loss_probs)
        loss = fluid.layers.reduce_sum(loss_probs)

        sgd = SGDOptimizer(learning_rate=1e-3,
                           parameter_list=policy.parameters())

        dy_param_init_value = {}
        dy_out = loss.numpy()

        for param in policy.parameters():
            dy_param_init_value[param.name] = param.numpy()

        loss.backward()
        sgd.minimize(loss)
        policy.clear_gradients()

        dy_param_value = {}
        for param in policy.parameters():
            dy_param_value[param.name] = param.numpy()

    with new_program_scope():
        fluid.default_startup_program().random_seed = seed
        fluid.default_main_program().random_seed = seed

        exe = fluid.Executor(fluid.CPUPlace()
                             if not core.is_compiled_with_cuda() else
                             fluid.CUDAPlace(0))

        policy = Policy(input_size=4)

        st_sgd = SGDOptimizer(learning_rate=1e-3)

        st_state = fluid.layers.data(name='st_state',
                                     shape=[4],
                                     dtype='float32')
        st_reward = fluid.layers.data(name='st_reward',
                                      shape=[1],
                                      dtype='float32')
        st_mask = fluid.layers.data(name='st_mask',
                                    shape=[2],
                                    dtype='float32')

        st_loss_probs = policy(st_state)

        st_loss_probs = fluid.layers.log(st_loss_probs)
        st_loss_probs = fluid.layers.elementwise_mul(st_loss_probs, st_mask)
        st_loss_probs = fluid.layers.reduce_sum(st_loss_probs, dim=-1)

        st_loss_probs = fluid.layers.elementwise_mul(st_reward,
                                                     st_loss_probs)
        st_loss = fluid.layers.reduce_sum(st_loss_probs)

        st_sgd.minimize(st_loss)

        # initialize params and fetch them
        static_param_init_value = {}
        static_param_name_list = []
        for param in policy.parameters():
            static_param_name_list.append(param.name)

        out = exe.run(fluid.default_startup_program(),
                      fetch_list=static_param_name_list)

        for i in range(len(static_param_name_list)):
            static_param_init_value[static_param_name_list[i]] = out[i]

        fetch_list = [st_loss.name]
        fetch_list.extend(static_param_name_list)

        out = exe.run(fluid.default_main_program(),
                      feed={
                          "st_state": state,
                          "st_reward": reward,
                          "st_mask": mask
                      },
                      fetch_list=fetch_list)

        static_param_value = {}
        static_out = out[0]
        for i in range(1, len(out)):
            static_param_value[static_param_name_list[i - 1]] = out[i]

    # self.assertTrue(np.allclose(dy_x_data.all(), static_x_data.all()))

    for key, value in six.iteritems(static_param_init_value):
        self.assertTrue(np.equal(value, dy_param_init_value[key]).all())

    self.assertTrue(np.equal(static_out, dy_out).all())

    for key, value in six.iteritems(static_param_value):
        self.assertTrue(np.equal(value, dy_param_value[key]).all())
def train_ptb_lm():
    args = parse_args()

    model_type = args.model_type
    vocab_size = 10000
    if model_type == "test":
        num_layers = 1
        batch_size = 2
        hidden_size = 10
        num_steps = 3
        init_scale = 0.1
        max_grad_norm = 5.0
        epoch_start_decay = 1
        max_epoch = 1
        dropout = 0.0
        lr_decay = 0.5
        base_learning_rate = 1.0
    elif model_type == "small":
        num_layers = 2
        batch_size = 20
        hidden_size = 200
        num_steps = 20
        init_scale = 0.1
        max_grad_norm = 5.0
        epoch_start_decay = 4
        max_epoch = 13
        dropout = 0.0
        lr_decay = 0.5
        base_learning_rate = 1.0
    elif model_type == "medium":
        num_layers = 2
        batch_size = 20
        hidden_size = 650
        num_steps = 35
        init_scale = 0.05
        max_grad_norm = 5.0
        epoch_start_decay = 6
        max_epoch = 39
        dropout = 0.5
        lr_decay = 0.8
        base_learning_rate = 1.0
    elif model_type == "large":
        num_layers = 2
        batch_size = 20
        hidden_size = 1500
        num_steps = 35
        init_scale = 0.04
        max_grad_norm = 10.0
        epoch_start_decay = 14
        max_epoch = 55
        dropout = 0.65
        lr_decay = 1.0 / 1.15
        base_learning_rate = 1.0
    else:
        print("model type not supported")
        return

    with fluid.dygraph.guard(core.CUDAPlace(0)):
        if args.ce:
            print("ce mode")
            seed = 33
            np.random.seed(seed)
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed
            max_epoch = 1

        ptb_model = PtbModel("ptb_model",
                             hidden_size=hidden_size,
                             vocab_size=vocab_size,
                             num_layers=num_layers,
                             num_steps=num_steps,
                             init_scale=init_scale,
                             dropout=dropout)

        dy_param_updated = dict()
        dy_param_init = dict()
        dy_loss = None
        last_hidden = None
        last_cell = None

        data_path = args.data_path
        print("begin to load data")
        ptb_data = reader.get_ptb_data(data_path)
        print("finished load data")
        train_data, valid_data, test_data = ptb_data

        batch_len = len(train_data) // batch_size
        total_batch_size = (batch_len - 1) // num_steps
        log_interval = total_batch_size // 20

        bd = []
        lr_arr = [1.0]
        for i in range(1, max_epoch):
            bd.append(total_batch_size * i)
            new_lr = base_learning_rate * (lr_decay**max(
                i + 1 - epoch_start_decay, 0.0))
            lr_arr.append(new_lr)

        sgd = SGDOptimizer(learning_rate=fluid.layers.piecewise_decay(
            boundaries=bd, values=lr_arr))

        def eval(model, data):
            print("begin to eval")
            total_loss = 0.0
            iters = 0.0
            init_hidden_data = np.zeros((num_layers, batch_size, hidden_size),
                                        dtype='float32')
            init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
                                      dtype='float32')

            model.eval()
            train_data_iter = reader.get_data_iter(data, batch_size,
                                                   num_steps)
            for batch_id, batch in enumerate(train_data_iter):
                x_data, y_data = batch
                x_data = x_data.reshape((-1, num_steps, 1))
                y_data = y_data.reshape((-1, 1))
                x = to_variable(x_data)
                y = to_variable(y_data)
                init_hidden = to_variable(init_hidden_data)
                init_cell = to_variable(init_cell_data)
                dy_loss, last_hidden, last_cell = ptb_model(
                    x, y, init_hidden, init_cell)

                out_loss = dy_loss.numpy()

                init_hidden_data = last_hidden.numpy()
                init_cell_data = last_cell.numpy()

                total_loss += out_loss
                iters += num_steps

            print("eval finished")
            ppl = np.exp(total_loss / iters)
            print("ppl ", batch_id, ppl[0])
            if args.ce:
                print("kpis\ttest_ppl\t%0.3f" % ppl[0])

        grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(
            max_grad_norm)
        for epoch_id in range(max_epoch):
            ptb_model.train()
            total_loss = 0.0
            iters = 0.0
            init_hidden_data = np.zeros((num_layers, batch_size, hidden_size),
                                        dtype='float32')
            init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
                                      dtype='float32')

            train_data_iter = reader.get_data_iter(train_data, batch_size,
                                                   num_steps)

            start_time = time.time()
            for batch_id, batch in enumerate(train_data_iter):
                x_data, y_data = batch
                x_data = x_data.reshape((-1, num_steps, 1))
                y_data = y_data.reshape((-1, 1))
                x = to_variable(x_data)
                y = to_variable(y_data)
                init_hidden = to_variable(init_hidden_data)
                init_cell = to_variable(init_cell_data)
                dy_loss, last_hidden, last_cell = ptb_model(
                    x, y, init_hidden, init_cell)

                out_loss = dy_loss.numpy()

                init_hidden_data = last_hidden.numpy()
                init_cell_data = last_cell.numpy()

                dy_loss.backward()
                sgd.minimize(dy_loss, grad_clip=grad_clip)
                ptb_model.clear_gradients()
                total_loss += out_loss
                iters += num_steps

                if batch_id > 0 and batch_id % log_interval == 0:
                    ppl = np.exp(total_loss / iters)
                    print(epoch_id, "ppl ", batch_id, ppl[0],
                          sgd._global_learning_rate().numpy())

            print("one epoch finished", epoch_id)
            print("time cost ", time.time() - start_time)
            ppl = np.exp(total_loss / iters)
            print("ppl ", epoch_id, ppl[0])
            if args.ce:
                print("kpis\ttrain_ppl\t%0.3f" % ppl[0])

        eval(ptb_model, test_data)
def test_mnist_forward_float32(self):
    epoch_num = 1

    with fluid.dygraph.guard():
        paddle.manual_seed(SEED)
        paddle.framework.random._manual_program_seed(SEED)
        mnist = MNIST()
        sgd = SGDOptimizer(learning_rate=1e-3,
                           parameter_list=mnist.parameters())

        train_reader = paddle.batch(paddle.dataset.mnist.train(),
                                    batch_size=128,
                                    drop_last=True)

        dy_param_init_value = {}
        mnist.eval()
        for epoch in range(epoch_num):
            for batch_id, data in enumerate(train_reader()):
                dy_x_data = np.array([x[0].reshape(1, 28, 28)
                                      for x in data]).astype('float32')
                y_data = np.array([x[1] for x in data
                                   ]).astype('int64').reshape(128, 1)

                img = to_variable(dy_x_data)
                label = to_variable(y_data)
                label.stop_gradient = True

                cost = mnist(img)
                loss = fluid.layers.cross_entropy(cost, label)
                avg_loss = fluid.layers.mean(loss)

                dy_out = avg_loss.numpy()

                if epoch == 0 and batch_id == 0:
                    for param in mnist.parameters():
                        dy_param_init_value[param.name] = param.numpy()

    with new_program_scope():
        paddle.manual_seed(SEED)
        paddle.framework.random._manual_program_seed(SEED)
        exe = fluid.Executor(fluid.CPUPlace()
                             if not core.is_compiled_with_cuda() else
                             fluid.CUDAPlace(0))

        mnist = MNIST()
        sgd = SGDOptimizer(learning_rate=1e-3)
        train_reader = paddle.batch(paddle.dataset.mnist.train(),
                                    batch_size=128,
                                    drop_last=True)

        img = fluid.layers.data(name='pixel',
                                shape=[1, 28, 28],
                                dtype='float32')
        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
        cost = mnist(img)
        loss = fluid.layers.cross_entropy(cost, label)
        avg_loss = fluid.layers.mean(loss)

        # initialize params and fetch them
        static_param_init_value = {}
        static_param_name_list = []
        for param in mnist.parameters():
            static_param_name_list.append(param.name)

        out = exe.run(fluid.default_startup_program(),
                      fetch_list=static_param_name_list)

        for i in range(len(static_param_name_list)):
            static_param_init_value[static_param_name_list[i]] = out[i]

        for epoch in range(epoch_num):
            for batch_id, data in enumerate(train_reader()):
                static_x_data = np.array([x[0].reshape(1, 28, 28)
                                          for x in data]).astype('float32')
                y_data = np.array([x[1] for x in data
                                   ]).astype('int64').reshape([128, 1])

                fetch_list = [avg_loss.name]
                out = exe.run(fluid.default_main_program(),
                              feed={
                                  "pixel": static_x_data,
                                  "label": y_data
                              },
                              fetch_list=fetch_list)

                static_out = out[0]

    self.assertTrue(np.allclose(dy_x_data.all(), static_x_data.all()))

    for key, value in six.iteritems(static_param_init_value):
        self.assertTrue(np.allclose(value, dy_param_init_value[key]))

    self.assertTrue(np.allclose(static_out, dy_out))
def train(use_cuda, save_dirname, is_local=True):
    scale_infer, avg_cost = model()

    # test program
    test_program = fluid.default_main_program().clone(for_test=True)

    sgd_optimizer = SGDOptimizer(learning_rate=0.2)
    sgd_optimizer.minimize(avg_cost)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = Executor(place)

    train_reader = paddle.batch(paddle.reader.shuffle(
        paddle.dataset.movielens.train(), buf_size=8192),
                                batch_size=BATCH_SIZE)
    test_reader = paddle.batch(paddle.dataset.movielens.test(),
                               batch_size=BATCH_SIZE)

    feed_order = [
        'user_id', 'gender_id', 'age_id', 'job_id', 'movie_id', 'category_id',
        'movie_title', 'score'
    ]

    def train_loop(main_program):
        exe.run(framework.default_startup_program())

        feed_list = [
            main_program.global_block().var(var_name)
            for var_name in feed_order
        ]
        feeder = fluid.DataFeeder(feed_list, place)

        PASS_NUM = 100
        for pass_id in range(PASS_NUM):
            for batch_id, data in enumerate(train_reader()):
                # train a mini-batch
                outs = exe.run(program=main_program,
                               feed=feeder.feed(data),
                               fetch_list=[avg_cost])
                out = np.array(outs[0])
                if (batch_id + 1) % 10 == 0:
                    avg_cost_set = []
                    for test_data in test_reader():
                        avg_cost_np = exe.run(program=test_program,
                                              feed=feeder.feed(test_data),
                                              fetch_list=[avg_cost])
                        avg_cost_set.append(avg_cost_np[0])
                        break  # test only 1 segment for speeding up CI

                    # get test avg_cost
                    test_avg_cost = np.array(avg_cost_set).mean()
                    if test_avg_cost < 6.0:
                        # if avg_cost less than 6.0, we think our code is good.
                        if save_dirname is not None:
                            fluid.io.save_inference_model(
                                save_dirname, [
                                    "user_id", "gender_id", "age_id",
                                    "job_id", "movie_id", "category_id",
                                    "movie_title"
                                ], [scale_infer], exe)
                        return

                if math.isnan(float(out[0])):
                    sys.exit("got NaN loss, training failed.")

    if is_local:
        train_loop(fluid.default_main_program())
    else:
        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")  # ip,ip...
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
        trainers = int(os.getenv("PADDLE_TRAINERS"))
        current_endpoint = os.getenv("POD_IP") + ":" + port
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
        training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")

        t = fluid.DistributeTranspiler()
        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
        if training_role == "PSERVER":
            pserver_prog = t.get_pserver_program(current_endpoint)
            pserver_startup = t.get_startup_program(current_endpoint,
                                                    pserver_prog)
            exe.run(pserver_startup)
            exe.run(pserver_prog)
        elif training_role == "TRAINER":
            train_loop(t.get_trainer_program())
def test_mnist_sort_gradient_float32(self):
    seed = 90
    epoch_num = 1

    with fluid.dygraph.guard():
        fluid.default_startup_program().random_seed = seed
        fluid.default_main_program().random_seed = seed
        fluid.set_flags({'FLAGS_sort_sum_gradient': True})

        mnist2 = MNIST()
        sgd2 = SGDOptimizer(learning_rate=1e-3,
                            parameter_list=mnist2.parameters())
        train_reader2 = paddle.batch(paddle.dataset.mnist.train(),
                                     batch_size=128,
                                     drop_last=True)

        mnist2.train()
        dy_param_init_value2 = {}
        for epoch in range(epoch_num):
            for batch_id, data in enumerate(train_reader2()):
                dy_x_data2 = np.array([x[0].reshape(1, 28, 28)
                                       for x in data]).astype('float32')
                y_data2 = np.array([x[1] for x in data
                                    ]).astype('int64').reshape(128, 1)

                img2 = to_variable(dy_x_data2)
                label2 = to_variable(y_data2)
                label2.stop_gradient = True

                cost2 = mnist2(img2)
                loss2 = fluid.layers.cross_entropy(cost2, label2)
                avg_loss2 = fluid.layers.mean(loss2)

                dy_out2 = avg_loss2.numpy()

                if epoch == 0 and batch_id == 0:
                    for param in mnist2.parameters():
                        dy_param_init_value2[param.name] = param.numpy()

                avg_loss2.backward()
                sgd2.minimize(avg_loss2)
                mnist2.clear_gradients()

                dy_param_value2 = {}
                for param in mnist2.parameters():
                    dy_param_value2[param.name] = param.numpy()
                if batch_id == 20:
                    break

    with new_program_scope():
        fluid.default_startup_program().random_seed = seed
        fluid.default_main_program().random_seed = seed

        exe = fluid.Executor(fluid.CPUPlace()
                             if not core.is_compiled_with_cuda() else
                             fluid.CUDAPlace(0))

        mnist = MNIST()
        sgd = SGDOptimizer(learning_rate=1e-3)
        train_reader = paddle.batch(paddle.dataset.mnist.train(),
                                    batch_size=128,
                                    drop_last=True)

        img = fluid.layers.data(name='pixel',
                                shape=[1, 28, 28],
                                dtype='float32')
        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
        cost = mnist(img)
        loss = fluid.layers.cross_entropy(cost, label)
        avg_loss = fluid.layers.mean(loss)
        sgd.minimize(avg_loss)

        # initialize params and fetch them
        static_param_init_value = {}
        static_param_name_list = []
        for param in mnist.parameters():
            static_param_name_list.append(param.name)

        out = exe.run(fluid.default_startup_program(),
                      fetch_list=static_param_name_list)

        for i in range(len(static_param_name_list)):
            static_param_init_value[static_param_name_list[i]] = out[i]

        for epoch in range(epoch_num):
            for batch_id, data in enumerate(train_reader()):
                static_x_data = np.array([x[0].reshape(1, 28, 28)
                                          for x in data]).astype('float32')
                y_data = np.array([x[1] for x in data
                                   ]).astype('int64').reshape([128, 1])

                fetch_list = [avg_loss.name]
                fetch_list.extend(static_param_name_list)
                out = exe.run(fluid.default_main_program(),
                              feed={
                                  "pixel": static_x_data,
                                  "label": y_data
                              },
                              fetch_list=fetch_list)

                static_param_value = {}
                static_out = out[0]
                for i in range(1, len(out)):
                    static_param_value[static_param_name_list[i -
                                                              1]] = out[i]
                if batch_id == 20:
                    break

    self.assertTrue(np.allclose(dy_x_data2.all(), static_x_data.all()))

    for key, value in six.iteritems(static_param_init_value):
        self.assertTrue(np.allclose(value, dy_param_init_value2[key]))

    self.assertTrue(np.allclose(static_out, dy_out2))

    for key, value in six.iteritems(static_param_value):
        self.assertTrue(np.allclose(value, dy_param_value2[key], atol=1e-5))
def get_optimizer_dygraph(self, parameter_list):
    optimizer = SGDOptimizer(
        learning_rate=fluid.layers.polynomial_decay(learning_rate=0.1,
                                                    decay_steps=5,
                                                    cycle=self.cycle),
        parameter_list=parameter_list)
    return optimizer
def simple_net_float32(self, is_sparse, dtype):
    places = [fluid.CPUPlace()]
    if core.is_compiled_with_cuda():
        places.append(fluid.CUDAPlace(0))

    for place in places:
        seed = 90
        hidden_size = 10
        vocab_size = 1000
        num_steps = 3
        init_scale = 0.1
        batch_size = 4
        batch_num = 200

        for is_sort_sum_gradient in [True, False]:
            with fluid.dygraph.guard(place):
                fluid.default_startup_program().random_seed = seed
                fluid.default_main_program().random_seed = seed

                simple_net = SimpleNet(hidden_size=hidden_size,
                                       vocab_size=vocab_size,
                                       num_steps=num_steps,
                                       init_scale=init_scale,
                                       is_sparse=is_sparse,
                                       dtype=dtype)

                sgd = SGDOptimizer(learning_rate=1e-3,
                                   parameter_list=simple_net.parameters())
                dy_param_updated = dict()
                dy_param_init = dict()
                dy_loss = None

                helper = DyGraphProgramDescTracerTestHelper(self)
                backward_strategy = fluid.dygraph.BackwardStrategy()
                backward_strategy.sort_sum_gradient = is_sort_sum_gradient

                for i in range(batch_num):
                    x_data = np.arange(12).reshape(4, 3).astype('int64')
                    y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
                    x_data = x_data.reshape((-1, num_steps))
                    y_data = y_data.reshape((-1, 1))

                    x = to_variable(x_data)
                    y = to_variable(y_data)
                    outs = simple_net(x, y)
                    dy_loss = outs
                    if i == 0:
                        for param in simple_net.parameters():
                            dy_param_init[param.name] = param.numpy()
                    dy_loss.backward(backward_strategy)
                    sgd.minimize(dy_loss)
                    sgd.clear_gradients()
                    if i == batch_num - 1:
                        for param in simple_net.parameters():
                            dy_param_updated[param.name] = param.numpy()
                dy_loss_value = dy_loss.numpy()

            with new_program_scope():
                fluid.default_startup_program().random_seed = seed
                fluid.default_main_program().random_seed = seed

                simple_net = SimpleNet(hidden_size=hidden_size,
                                       vocab_size=vocab_size,
                                       num_steps=num_steps,
                                       is_sparse=is_sparse,
                                       dtype=dtype)

                exe = fluid.Executor(place)
                sgd = SGDOptimizer(learning_rate=1e-3)
                x = fluid.layers.data(name="x",
                                      shape=[-1, num_steps],
                                      dtype='int64')
                y = fluid.layers.data(name="y", shape=[-1, 1], dtype=dtype)

                static_loss = simple_net(x, y)
                sgd.minimize(static_loss)
                static_param_updated = dict()
                static_param_init = dict()
                static_param_name_list = list()
                for param in simple_net.parameters():
                    static_param_name_list.append(param.name)

                out = exe.run(fluid.default_startup_program(),
                              fetch_list=static_param_name_list)

                for i in range(len(static_param_name_list)):
                    static_param_init[static_param_name_list[i]] = out[i]

                static_loss_value = None
                for i in range(batch_num):
                    x_data = np.arange(12).reshape(4, 3).astype('int64')
                    y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
                    x_data = x_data.reshape((-1, num_steps))
                    y_data = y_data.reshape((-1, 1))
                    fetch_list = [static_loss]
                    fetch_list.extend(static_param_name_list)
                    out = exe.run(fluid.default_main_program(),
                                  feed={
                                      "x": x_data,
                                      "y": y_data
                                  },
                                  fetch_list=fetch_list)
                    static_loss_value = out[0]

                    if i == batch_num - 1:
                        # out[0] is the loss, so parameter k - 1 sits at
                        # out[k]; start at 1 so the first two parameters are
                        # not skipped (the original started at 3).
                        for k in range(1, len(out)):
                            static_param_updated[static_param_name_list[
                                k - 1]] = out[k]

            self.assertTrue(np.array_equal(static_loss_value, dy_loss_value))
            for key, value in six.iteritems(static_param_init):
                self.assertTrue(np.array_equal(value, dy_param_init[key]))
            for key, value in six.iteritems(static_param_updated):
                self.assertTrue(np.array_equal(value, dy_param_updated[key]))
def ptb_rnn_cpu_float32(self, is_sparse):
    seed = 90
    hidden_size = 10
    vocab_size = 1000
    num_layers = 1
    num_steps = 3
    init_scale = 0.1
    batch_size = 4
    batch_num = 200
    traced_layer = None

    with fluid.dygraph.guard():
        paddle.seed(seed)
        paddle.framework.random._manual_program_seed(seed)
        # TODO: marsyang1993 Change seed to
        ptb_model = PtbModel(hidden_size=hidden_size,
                             vocab_size=vocab_size,
                             num_layers=num_layers,
                             num_steps=num_steps,
                             init_scale=init_scale,
                             is_sparse=is_sparse)

        sgd = SGDOptimizer(learning_rate=1e-3,
                           parameter_list=ptb_model.parameters())
        dy_param_updated = dict()
        dy_param_init = dict()
        dy_loss = None
        last_hidden = None
        last_cell = None

        helper = DyGraphProgramDescTracerTestHelper(self)
        program = None

        for i in range(batch_num):
            x_data = np.arange(12).reshape(4, 3).astype('int64')
            y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
            y_data = y_data.reshape((-1, 1))

            init_hidden_data = np.zeros((num_layers, batch_size, hidden_size),
                                        dtype='float32')
            init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
                                      dtype='float32')

            x = to_variable(x_data)
            y = to_variable(y_data)
            init_hidden = to_variable(init_hidden_data)
            init_cell = to_variable(init_cell_data)
            if i % 5 == 0 and _in_legacy_dygraph():
                outs, traced_layer = TracedLayer.trace(
                    ptb_model, [x, y, init_hidden, init_cell])
                outs_static = traced_layer([x, y, init_hidden, init_cell])
                helper.assertEachVar(outs, outs_static)

                if program is not None:
                    self.assertTrue(
                        is_equal_program(traced_layer.program, program))

                program = traced_layer.program

                traced_layer.save_inference_model('./infe_imperative_ptb_rnn',
                                                  feed=list(range(4)))
            else:
                outs = ptb_model(x, y, init_hidden, init_cell)

            dy_loss, last_hidden, last_cell = outs

            if i == 0:
                for param in ptb_model.parameters():
                    dy_param_init[param.name] = param.numpy()
            dy_loss.backward()
            sgd.minimize(dy_loss)
            ptb_model.clear_gradients()
            if i == batch_num - 1:
                for param in ptb_model.parameters():
                    dy_param_updated[param.name] = param.numpy()

        dy_loss_value = dy_loss.numpy()
        dy_last_cell_value = last_cell.numpy()
        dy_last_hidden_value = last_hidden.numpy()

    with new_program_scope():
        paddle.seed(seed)
        paddle.framework.random._manual_program_seed(seed)
        ptb_model = PtbModel(hidden_size=hidden_size,
                             vocab_size=vocab_size,
                             num_layers=num_layers,
                             num_steps=num_steps,
                             init_scale=init_scale,
                             is_sparse=is_sparse)

        exe = fluid.Executor(fluid.CPUPlace()
                             if not core.is_compiled_with_cuda() else
                             fluid.CUDAPlace(0))
        sgd = SGDOptimizer(learning_rate=1e-3)
        x = fluid.layers.data(name="x", shape=[-1, num_steps], dtype='int64')
        y = fluid.layers.data(name="y", shape=[-1, 1], dtype='float32')
        init_hidden = fluid.layers.data(name="init_hidden",
                                        shape=[1],
                                        dtype='float32')
        init_cell = fluid.layers.data(name="init_cell",
                                      shape=[1],
                                      dtype='float32')

        static_loss, static_last_hidden, static_last_cell = ptb_model(
            x, y, init_hidden, init_cell)
        sgd.minimize(static_loss)
        static_param_updated = dict()
        static_param_init = dict()
        static_param_name_list = list()
        for param in ptb_model.parameters():
            static_param_name_list.append(param.name)

        out = exe.run(framework.default_startup_program(),
                      fetch_list=static_param_name_list)

        for i in range(len(static_param_name_list)):
            static_param_init[static_param_name_list[i]] = out[i]
        static_loss_value = None
        static_last_cell_value = None
        static_last_hidden_value = None
        for i in range(batch_num):
            x_data = np.arange(12).reshape(4, 3).astype('int64')
            y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
            x_data = x_data.reshape((-1, num_steps, 1))
            y_data = y_data.reshape((-1, 1))
            init_hidden_data = np.zeros((num_layers, batch_size, hidden_size),
                                        dtype='float32')
            init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
                                      dtype='float32')
            fetch_list = [static_loss, static_last_hidden, static_last_cell]
            fetch_list.extend(static_param_name_list)
            out = exe.run(fluid.default_main_program(),
                          feed={
                              "x": x_data,
                              "y": y_data,
                              "init_hidden": init_hidden_data,
                              "init_cell": init_cell_data
                          },
                          fetch_list=fetch_list)
            static_loss_value = out[0]
            static_last_hidden_value = out[1]
            static_last_cell_value = out[2]
            if i == batch_num - 1:
                for k in range(3, len(out)):
                    static_param_updated[static_param_name_list[k -
                                                                3]] = out[k]

    self.assertTrue(np.array_equal(static_loss_value, dy_loss_value))
    self.assertTrue(np.array_equal(static_last_cell_value,
                                   dy_last_cell_value))
    self.assertTrue(
        np.array_equal(static_last_hidden_value, dy_last_hidden_value))
    for key, value in six.iteritems(static_param_init):
        self.assertTrue(np.array_equal(value, dy_param_init[key]))
    for key, value in six.iteritems(static_param_updated):
        self.assertTrue(np.array_equal(value, dy_param_updated[key]))
def ptb_rnn_sort_gradient_cpu_float32(self, is_sparse):
    seed = 90
    hidden_size = 10
    vocab_size = 1000
    num_layers = 1
    num_steps = 3
    init_scale = 0.1
    batch_size = 4
    batch_num = 200

    with fluid.dygraph.guard():
        fluid.default_startup_program().random_seed = seed
        fluid.default_main_program().random_seed = seed
        backward_strategy = fluid.dygraph.BackwardStrategy()
        backward_strategy.sort_sum_gradient = True
        # TODO: marsyang1993 Change seed to
        ptb_model = PtbModel(hidden_size=hidden_size,
                             vocab_size=vocab_size,
                             num_layers=num_layers,
                             num_steps=num_steps,
                             init_scale=init_scale,
                             is_sparse=is_sparse)

        sgd = SGDOptimizer(learning_rate=1e-3,
                           parameter_list=ptb_model.parameters())
        dy_param_updated = dict()
        dy_param_init = dict()
        dy_loss = None
        last_hidden = None
        last_cell = None

        for i in range(batch_num):
            x_data = np.arange(12).reshape(4, 3).astype('int64')
            y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
            x_data = x_data.reshape((-1, num_steps, 1))
            y_data = y_data.reshape((-1, 1))
            init_hidden_data = np.zeros((num_layers, batch_size, hidden_size),
                                        dtype='float32')
            init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
                                      dtype='float32')
            x = to_variable(x_data)
            y = to_variable(y_data)
            init_hidden = to_variable(init_hidden_data)
            init_cell = to_variable(init_cell_data)
            dy_loss, last_hidden, last_cell = ptb_model(
                x, y, init_hidden, init_cell)
            if i == 0:
                for param in ptb_model.parameters():
                    dy_param_init[param.name] = param.numpy()
            dy_loss.backward(backward_strategy)
            sgd.minimize(dy_loss)
            ptb_model.clear_gradients()
            if i == batch_num - 1:
                for param in ptb_model.parameters():
                    dy_param_updated[param.name] = param.numpy()

        dy_loss_value = dy_loss.numpy()
        dy_last_cell_value = last_cell.numpy()
        dy_last_hidden_value = last_hidden.numpy()

    with new_program_scope():
        fluid.default_startup_program().random_seed = seed
        fluid.default_main_program().random_seed = seed
        ptb_model = PtbModel(hidden_size=hidden_size,
                             vocab_size=vocab_size,
                             num_layers=num_layers,
                             num_steps=num_steps,
                             init_scale=init_scale,
                             is_sparse=is_sparse)

        exe = fluid.Executor(fluid.CPUPlace()
                             if not core.is_compiled_with_cuda() else
                             fluid.CUDAPlace(0))
        sgd = SGDOptimizer(learning_rate=1e-3)
        x = fluid.layers.data(name="x",
                              shape=[-1, num_steps, 1],
                              dtype='int64')
        y = fluid.layers.data(name="y", shape=[-1, 1], dtype='float32')
        init_hidden = fluid.layers.data(name="init_hidden",
                                        shape=[1],
                                        dtype='float32')
        init_cell = fluid.layers.data(name="init_cell",
                                      shape=[1],
                                      dtype='float32')

        static_loss, static_last_hidden, static_last_cell = ptb_model(
            x, y, init_hidden, init_cell)
        sgd.minimize(static_loss)
        static_param_updated = dict()
        static_param_init = dict()
        static_param_name_list = list()
        for param in ptb_model.parameters():
            static_param_name_list.append(param.name)

        out = exe.run(framework.default_startup_program(),
                      fetch_list=static_param_name_list)

        for i in range(len(static_param_name_list)):
            static_param_init[static_param_name_list[i]] = out[i]
        static_loss_value = None
        static_last_cell_value = None
        static_last_hidden_value = None
        for i in range(batch_num):
            x_data = np.arange(12).reshape(4, 3).astype('int64')
            y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
            x_data = x_data.reshape((-1, num_steps, 1))
            y_data = y_data.reshape((-1, 1))
            init_hidden_data = np.zeros((num_layers, batch_size, hidden_size),
                                        dtype='float32')
            init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
                                      dtype='float32')
            fetch_list = [static_loss, static_last_hidden, static_last_cell]
            fetch_list.extend(static_param_name_list)
            out = exe.run(fluid.default_main_program(),
                          feed={
                              "x": x_data,
                              "y": y_data,
                              "init_hidden": init_hidden_data,
                              "init_cell": init_cell_data
                          },
                          fetch_list=fetch_list)
            static_loss_value = out[0]
            static_last_hidden_value = out[1]
            static_last_cell_value = out[2]
            if i == batch_num - 1:
                for k in range(3, len(out)):
                    static_param_updated[static_param_name_list[k -
                                                                3]] = out[k]

    self.assertTrue(np.array_equal(static_loss_value, dy_loss_value))
    self.assertTrue(np.array_equal(static_last_cell_value,
                                   dy_last_cell_value))
    self.assertTrue(
        np.array_equal(static_last_hidden_value, dy_last_hidden_value))
    for key, value in six.iteritems(static_param_init):
        self.assertTrue(np.array_equal(value, dy_param_init[key]))
    for key, value in six.iteritems(static_param_updated):
        self.assertTrue(np.array_equal(value, dy_param_updated[key]))
def test_gan_float32(self):
    seed = 90
    startup = fluid.Program()
    startup.random_seed = seed

    discriminate_p = fluid.Program()
    generate_p = fluid.Program()
    discriminate_p.random_seed = seed
    generate_p.random_seed = seed

    scope = fluid.core.Scope()
    with new_program_scope(main=discriminate_p, startup=startup,
                           scope=scope):
        discriminator = Discriminator()
        generator = Generator()

        img = fluid.layers.data(name="img",
                                shape=[2, 1],
                                append_batch_size=False)
        noise = fluid.layers.data(name="noise",
                                  shape=[2, 2],
                                  append_batch_size=False)

        d_real = discriminator(img)
        d_loss_real = fluid.layers.reduce_mean(
            fluid.layers.sigmoid_cross_entropy_with_logits(
                x=d_real,
                label=fluid.layers.fill_constant(shape=[2, 1],
                                                 dtype='float32',
                                                 value=1.0)))

        d_fake = discriminator(generator(noise))
        d_loss_fake = fluid.layers.reduce_mean(
            fluid.layers.sigmoid_cross_entropy_with_logits(
                x=d_fake,
                label=fluid.layers.fill_constant(shape=[2, 1],
                                                 dtype='float32',
                                                 value=0.0)))

        d_loss = d_loss_real + d_loss_fake

        sgd = SGDOptimizer(learning_rate=1e-3)
        sgd.minimize(d_loss)

    with new_program_scope(main=generate_p, startup=startup, scope=scope):
        discriminator = Discriminator()
        generator = Generator()

        noise = fluid.layers.data(name="noise",
                                  shape=[2, 2],
                                  append_batch_size=False)

        d_fake = discriminator(generator(noise))
        g_loss = fluid.layers.reduce_mean(
            fluid.layers.sigmoid_cross_entropy_with_logits(
                x=d_fake,
                label=fluid.layers.fill_constant(shape=[2, 1],
                                                 dtype='float32',
                                                 value=1.0)))

        sgd = SGDOptimizer(learning_rate=1e-3)
        sgd.minimize(g_loss)

    exe = fluid.Executor(fluid.CPUPlace() if not core.is_compiled_with_cuda()
                         else fluid.CUDAPlace(0))
    static_params = dict()
    with fluid.scope_guard(scope):
        img = np.ones([2, 1], np.float32)
        noise = np.ones([2, 2], np.float32)
        exe.run(startup)
        static_d_loss = exe.run(discriminate_p,
                                feed={
                                    'img': img,
                                    'noise': noise
                                },
                                fetch_list=[d_loss])[0]
        static_g_loss = exe.run(generate_p,
                                feed={'noise': noise},
                                fetch_list=[g_loss])[0]

        # generate_p contains all parameters needed.
        for param in generate_p.global_block().all_parameters():
            static_params[param.name] = np.array(
                scope.find_var(param.name).get_tensor())

    dy_params = dict()
    with fluid.dygraph.guard():
        fluid.default_startup_program().random_seed = seed
        fluid.default_main_program().random_seed = seed

        discriminator = Discriminator()
        generator = Generator()
        sgd = SGDOptimizer(learning_rate=1e-3,
                           parameter_list=(discriminator.parameters() +
                                           generator.parameters()))

        d_real = discriminator(to_variable(np.ones([2, 1], np.float32)))
        d_loss_real = fluid.layers.reduce_mean(
            fluid.layers.sigmoid_cross_entropy_with_logits(
                x=d_real, label=to_variable(np.ones([2, 1], np.float32))))

        d_fake = discriminator(
            generator(to_variable(np.ones([2, 2], np.float32))))
        d_loss_fake = fluid.layers.reduce_mean(
            fluid.layers.sigmoid_cross_entropy_with_logits(
                x=d_fake, label=to_variable(np.zeros([2, 1], np.float32))))

        d_loss = d_loss_real + d_loss_fake
        d_loss.backward()
        sgd.minimize(d_loss)
        discriminator.clear_gradients()
        generator.clear_gradients()

        d_fake = discriminator(
            generator(to_variable(np.ones([2, 2], np.float32))))
        g_loss = fluid.layers.reduce_mean(
            fluid.layers.sigmoid_cross_entropy_with_logits(
                x=d_fake, label=to_variable(np.ones([2, 1], np.float32))))
        g_loss.backward()
        sgd.minimize(g_loss)
        for p in discriminator.parameters():
            dy_params[p.name] = p.numpy()
        for p in generator.parameters():
            dy_params[p.name] = p.numpy()

        dy_g_loss = g_loss.numpy()
        dy_d_loss = d_loss.numpy()

    dy_params2 = dict()
    with fluid.dygraph.guard():
        fluid.default_startup_program().random_seed = seed
        fluid.default_main_program().random_seed = seed
        backward_strategy = fluid.dygraph.BackwardStrategy()
        backward_strategy.sort_sum_gradient = True

        discriminator2 = Discriminator()
        generator2 = Generator()
        sgd2 = SGDOptimizer(learning_rate=1e-3,
                            parameter_list=(discriminator2.parameters() +
                                            generator2.parameters()))

        d_real2 = discriminator2(to_variable(np.ones([2, 1], np.float32)))
        d_loss_real2 = fluid.layers.reduce_mean(
            fluid.layers.sigmoid_cross_entropy_with_logits(
                x=d_real2, label=to_variable(np.ones([2, 1], np.float32))))

        d_fake2 = discriminator2(
            generator2(to_variable(np.ones([2, 2], np.float32))))
        d_loss_fake2 = fluid.layers.reduce_mean(
            fluid.layers.sigmoid_cross_entropy_with_logits(
                x=d_fake2, label=to_variable(np.zeros([2, 1], np.float32))))

        d_loss2 = d_loss_real2 + d_loss_fake2
        d_loss2.backward(backward_strategy)
        sgd2.minimize(d_loss2)
        discriminator2.clear_gradients()
        generator2.clear_gradients()

        d_fake2 = discriminator2(
            generator2(to_variable(np.ones([2, 2], np.float32))))
        g_loss2 = fluid.layers.reduce_mean(
            fluid.layers.sigmoid_cross_entropy_with_logits(
                x=d_fake2, label=to_variable(np.ones([2, 1], np.float32))))
        g_loss2.backward(backward_strategy)
        sgd2.minimize(g_loss2)
        for p in discriminator2.parameters():
            dy_params2[p.name] = p.numpy()
        # collect from generator2, not the generator of the previous run
        # (the original iterated over the stale `generator`)
        for p in generator2.parameters():
            dy_params2[p.name] = p.numpy()

        dy_g_loss2 = g_loss2.numpy()
        dy_d_loss2 = d_loss2.numpy()

    self.assertEqual(dy_g_loss, static_g_loss)
    self.assertEqual(dy_d_loss, static_d_loss)
    for k, v in six.iteritems(dy_params):
        self.assertTrue(np.allclose(v, static_params[k]))

    self.assertEqual(dy_g_loss2, static_g_loss)
    self.assertEqual(dy_d_loss2, static_d_loss)
    for k, v in six.iteritems(dy_params2):
        self.assertTrue(np.allclose(v, static_params[k]))
def test_ptb_rnn_cpu_bfloat16(self):
    seed = 90
    hidden_size = 10
    vocab_size = 500
    num_layers = 1
    num_steps = 3
    init_scale = 0.1
    batch_size = 4
    batch_num = 100

    with new_program_scope():
        fluid.default_startup_program().random_seed = seed
        fluid.default_main_program().random_seed = seed
        ptb_model = PtbModel("ptb_model",
                             hidden_size=hidden_size,
                             vocab_size=vocab_size,
                             num_layers=num_layers,
                             num_steps=num_steps,
                             init_scale=init_scale)

        place = self.set_place()
        exe = fluid.Executor(place)
        sgd = SGDOptimizer(learning_rate=1e-3)
        x = fluid.layers.data(name="x", shape=[-1, num_steps], dtype='int64')
        y = fluid.layers.data(name="y", shape=[-1, 1], dtype='float32')
        init_hidden = fluid.layers.data(name="init_hidden",
                                        shape=[1],
                                        dtype='float32')
        init_cell = fluid.layers.data(name="init_cell",
                                      shape=[1],
                                      dtype='float32')

        static_loss, static_last_hidden, static_last_cell = ptb_model(
            x, y, init_hidden, init_cell)

        sgd = paddle.static.amp.bf16.decorate_bf16(
            sgd,
            amp_lists=paddle.static.amp.bf16.AutoMixedPrecisionListsBF16(
                custom_fp32_list={'transpose2', 'concat'}),
            use_bf16_guard=False,
            use_pure_bf16=True)

        sgd.minimize(static_loss, framework.default_startup_program())
        out = exe.run(framework.default_startup_program())

        for i in range(batch_num):
            x_data = np.arange(12).reshape(4, 3).astype('int64')
            y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
            x_data = x_data.reshape((-1, num_steps, 1))
            y_data = y_data.reshape((-1, 1))
            # TODO: investigate initializing the model with "float32" instead
            # of "uint16" as it was before the slice_op PR (datatypes in the
            # model graph differ from runtime datatypes because of that)
            init_hidden_data = np.zeros((num_layers, batch_size, hidden_size),
                                        dtype='uint16')
            init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
                                      dtype='uint16')
            fetch_list = [static_loss, static_last_hidden, static_last_cell]
            out = exe.run(fluid.default_main_program(),
                          feed={
                              "x": x_data,
                              "y": y_data,
                              "init_hidden": init_hidden_data,
                              "init_cell": init_cell_data
                          },
                          fetch_list=fetch_list)

        # get values before saving
        main_program = framework.default_main_program()
        base_map = {}
        for var in main_program.list_vars():
            if isinstance(var, framework.Parameter) or var.persistable:
                t = np.array(fluid.global_scope().find_var(
                    var.name).get_tensor())
                # make sure all the parameter and optimizer vars have been
                # updated
                self.assertTrue(np.sum(np.abs(t)) != 0)
                base_map[var.name] = t

        fluid.save(main_program, "./test_1")

        # set vars to zero
        for var in main_program.list_vars():
            if isinstance(var, framework.Parameter) or var.persistable:
                ten = fluid.global_scope().find_var(var.name).get_tensor()
                ten.set(np.zeros_like(np.array(ten)), place)

                new_t = np.array(fluid.global_scope().find_var(
                    var.name).get_tensor())
                # make sure all the parameter and optimizer vars have been
                # set to zero
                self.assertTrue(np.sum(np.abs(new_t)) == 0)

        fluid.load(main_program, "./test_1.pdparams", exe)

        for var in main_program.list_vars():
            if isinstance(var, framework.Parameter) or var.persistable:
                new_t = np.array(fluid.global_scope().find_var(
                    var.name).get_tensor())
                base_t = base_map[var.name]
                self.assertTrue(np.array_equal(new_t, base_t))
def train(use_cuda, save_dirname, is_local=True):
    scale_infer, avg_cost = model()

    # test program
    test_program = fluid.default_main_program().clone(for_test=True)

    sgd_optimizer = SGDOptimizer(learning_rate=0.2)
    sgd_optimizer.minimize(avg_cost)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = Executor(place)

    train_reader = paddle.batch(
        paddle.reader.shuffle(paddle.dataset.movielens.train(),
                              buf_size=8192),
        batch_size=BATCH_SIZE)
    test_reader = paddle.batch(paddle.dataset.movielens.test(),
                               batch_size=BATCH_SIZE)

    feed_order = [
        'user_id', 'gender_id', 'age_id', 'job_id', 'movie_id', 'category_id',
        'movie_title', 'score'
    ]

    def train_loop(main_program):
        exe.run(framework.default_startup_program())

        feed_list = [
            main_program.global_block().var(var_name)
            for var_name in feed_order
        ]
        feeder = fluid.DataFeeder(feed_list, place)

        PASS_NUM = 100
        for pass_id in range(PASS_NUM):
            for batch_id, data in enumerate(train_reader()):
                # train a mini-batch
                outs = exe.run(program=main_program,
                               feed=feeder.feed(data),
                               fetch_list=[avg_cost])
                out = np.array(outs[0])
                if (batch_id + 1) % 10 == 0:
                    avg_cost_set = []
                    for test_data in test_reader():
                        avg_cost_np = exe.run(program=test_program,
                                              feed=feeder.feed(test_data),
                                              fetch_list=[avg_cost])
                        avg_cost_set.append(avg_cost_np[0])
                        break  # test only 1 segment for speeding up CI

                    # get test avg_cost
                    test_avg_cost = np.array(avg_cost_set).mean()
                    if test_avg_cost < 6.0:
                        # if avg_cost less than 6.0, we think our code is good.
                        if save_dirname is not None:
                            fluid.io.save_inference_model(save_dirname, [
                                "user_id", "gender_id", "age_id", "job_id",
                                "movie_id", "category_id", "movie_title"
                            ], [scale_infer], exe)
                        return

                if math.isnan(float(out[0])):
                    sys.exit("got NaN loss, training failed.")

    if is_local:
        train_loop(fluid.default_main_program())
    else:
        port = os.getenv("PADDLE_INIT_PORT", "6174")
        pserver_ips = os.getenv("PADDLE_INIT_PSERVERS")  # ip,ip...
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
        trainers = int(os.getenv("TRAINERS"))
        current_endpoint = os.getenv("POD_IP") + ":" + port
        trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
        training_role = os.getenv("TRAINING_ROLE", "TRAINER")

        t = fluid.DistributeTranspiler()
        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
        if training_role == "PSERVER":
            pserver_prog = t.get_pserver_program(current_endpoint)
            pserver_startup = t.get_startup_program(current_endpoint,
                                                    pserver_prog)
            exe.run(pserver_startup)
            exe.run(pserver_prog)
        elif training_role == "TRAINER":
            train_loop(t.get_trainer_program())