    # print(param)
elif name.find('bias') != -1:
    # bias init
    nn.init.normal_(param, mean=0, std=1e-2)
    # print(param)
else:
    print('Init error')

# Create data buffer
exp_data = DataBuffer(env, max_trajectory=num_iter_algo * 10 + n_rnd,
                      shaping_state_delta=shaping_state_delta)

# During the first n_rnd trials, apply randomized controls
for i in range(n_rnd):
    exp_data.push(rollout(env, randpol, max_steps=T))
# cost_mean, cost_std = test_episodic_cost(env, policy, N=50, T=T, render=False)

for i in range(num_iter_algo):
    log.infov('-----------------DeepPILCO Iteration # {}-----------------'.format(i + 1))

    # Train dynamics model
    train_dynamics_model_pilco(dynamics, dynamics_optimizer, exp_data,
                               epochs=num_itr_dyn, batch_size=dyn_batch_size,
                               plot_train=None, pre_process=pre_process)

    # Update policy
    log.infov('Policy optimization...')
    policy.update_dataset_statistics(exp_data)
    for j in range(num_iter_policy):
        _, list_costs, list_moments = learn_policy_pilco(env, dynamics, policy,
                                                         policy_optimizer,
                                                         K=K, T=1000, gamma=0.99,
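# ---------------------------------------------------------------------------
# Hedged sketch: the learn_policy_pilco call above is truncated in this
# fragment, and its internals are not shown. As a rough illustration of the
# DeepPILCO-style policy evaluation it performs, the snippet below propagates
# K particles through the dropout dynamics model and accumulates discounted
# cost. Every name except torch is an assumption made for illustration, and
# the moment-matching/resampling step of the full algorithm is omitted.
# ---------------------------------------------------------------------------
import torch

def sketch_particle_cost(dynamics, policy, cost_fn, s0, K=10, T=1000, gamma=0.99):
    """Propagate K particles from start state s0 (shape (1, state_dim))
    through the learned dynamics, keeping dropout active so each particle
    sees a different model sample, and return the mean discounted cost."""
    dynamics.train()                                  # MC dropout stays on
    s = s0.repeat(K, 1)                               # K particles
    total_cost, discount = 0.0, 1.0
    for _ in range(T):
        a = policy(s)                                 # differentiable actions
        s = s + dynamics(torch.cat([s, a], dim=-1))   # model predicts a state delta
        total_cost = total_cost + discount * cost_fn(s).mean()
        discount *= gamma
    return total_cost                                 # backprop through this to update the policy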
    dyn_model=dynamics,
    horizon=mpc_horizon,
    cost_fn=cost_fn,
    num_simulated_paths=simulated_paths,
    action_noise=action_noise,
    N_SAMPLES=10,
)

# Create data buffer (intended capacity: n_rnd + n_iter_algo * N_MPC trajectories)
exp_data = DataBuffer(env, max_trajectory=n_rnd + 3 * N_MPC)

# During the first n_rnd trials, apply randomized controls
for i in range(n_rnd):
    exp_data.push(rollout(env, randpol, max_steps=max_timestep))

log.infov('-----------------DeepPILCO initial dynamics fit-----------------')

# Train dynamics model
train_dynamics_model_pilco(dynamics, dynamics_optimizer, exp_data,
                           epochs=num_itr_dyn, batch_size=dyn_batch_size,
                           plot_train=None, pre_process=pre_process)
# dynamics.update_dataset_statistics(exp_data)

# Save model
save_dir = log_dir
utils.save_net_param(dynamics, save_dir, name='dyn_model0', mode='net')
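# ---------------------------------------------------------------------------
# Hedged sketch: the MPC controller constructed above is not shown, but its
# arguments (horizon, cost_fn, num_simulated_paths, action_noise) match a
# random-shooting scheme. The function below illustrates that scheme only;
# the dyn_model and cost_fn call signatures are assumptions.
# ---------------------------------------------------------------------------
import numpy as np

def sketch_random_shooting(dyn_model, cost_fn, s0, action_dim,
                           horizon=15, num_simulated_paths=1000):
    """Sample random action sequences, roll each through the learned model,
    and return the first action of the cheapest sequence (replanned every step)."""
    actions = np.random.uniform(-1.0, 1.0,
                                size=(num_simulated_paths, horizon, action_dim))
    states = np.tile(s0, (num_simulated_paths, 1))    # one copy of s0 per path
    costs = np.zeros(num_simulated_paths)
    for t in range(horizon):
        states = dyn_model(states, actions[:, t])     # batched one-step prediction
        costs += cost_fn(states)                      # per-path running cost
    return actions[np.argmin(costs), 0]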
## PPO init
state_dim = env.observation_space.shape[0]
value_net = Value(state_dim).cuda()
optimizer_value = torch.optim.Adam(value_net.parameters(), lr=ppo_args['learning_rate'])

# Create data buffer
exp_data = DataBuffer(env, max_trajectory=num_iter_algo * 10 + n_rnd,
                      shaping_state_delta=shaping_state_delta)
memory = Memory()

# During the first n_rnd trials, apply randomized controls
for i in range(n_rnd):
    exp_data.push(rollout(env, randpol, max_steps=T, memory=memory))
# cost_mean, cost_std = test_episodic_cost(env, policy, N=50, T=T, render=False)

policy_PPOoptimizer = torch.optim.Adam(policy.parameters(), lr=ppo_args['learning_rate'])
update_PPO_params(memory.sample(), 0, value_net, policy, optimizer_value,
                  policy_PPOoptimizer, ppo_args)

for i in range(num_iter_algo):
    log.infov('-----------------DeepPILCO Iteration # {}-----------------'.format(i + 1))

    # Train dynamics model
    train_dynamics_model_pilco(dynamics,
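# ---------------------------------------------------------------------------
# Hedged sketch: update_PPO_params' internals are not shown in this fragment.
# A standard PPO clipped-surrogate policy loss, which such an update typically
# minimizes, looks like this; all tensor names are illustrative assumptions.
# ---------------------------------------------------------------------------
import torch

def sketch_ppo_policy_loss(log_probs_new, log_probs_old, advantages, clip_eps=0.2):
    """Clipped surrogate objective (Schulman et al., 2017): limit how far the
    new policy's action probabilities can move from the sampling policy's."""
    ratio = torch.exp(log_probs_new - log_probs_old)          # pi_new / pi_old
    unclipped = ratio * advantages
    clipped = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * advantages
    return -torch.min(unclipped, clipped).mean()              # negate: we minimize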
    hidden_size=[hidden_size] * num_hidden_layers,
    drop_prob=drop_p,
    activation=net_activation).cuda()
dynamics_optimizer = torch.optim.Adam(dynamics.parameters(), lr=lr_dynamics,
                                      weight_decay=dyn_reg2)

# Create random policy
randpol = controller.RandomPolicy(env)

# Create data buffer
exp_data = DataBuffer(env, max_trajectory=100)

# During the first n_rnd trials, apply randomized controls
for i in range(n_rnd):
    exp_data.push(rollout(env, randpol, max_steps=T, render=False))

log.infov('-----------------DeepPILCO dynamics pre-training-----------------')

# Train dynamics model
train_dynamics_model_pilco(dynamics, dynamics_optimizer, exp_data,
                           epochs=num_itr_dyn, batch_size=dyn_batch_size,
                           plot_train=None, pre_process=pre_process,
                           logger=logger)  # plot_train_ion

# Save model
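# ---------------------------------------------------------------------------
# Hedged sketch: the dynamics class constructed above is not shown. A dropout
# MLP matching its constructor arguments (hidden_size, drop_prob, activation)
# could be assembled as below; this is an illustration, not the project's
# actual network. Keeping the net in train() mode at prediction time makes
# each forward pass an approximate posterior sample over the dynamics, which
# is the uncertainty mechanism DeepPILCO relies on; the Adam weight_decay
# (dyn_reg2) above is the matching regularizer.
# ---------------------------------------------------------------------------
import torch.nn as nn

def sketch_dropout_mlp(in_dim, out_dim, hidden_size=200, num_hidden_layers=2,
                       drop_prob=0.1, activation=nn.ReLU):
    """MLP with dropout after every hidden layer (MC-dropout dynamics model)."""
    layers, last = [], in_dim
    for _ in range(num_hidden_layers):
        layers += [nn.Linear(last, hidden_size), activation(),
                   nn.Dropout(p=drop_prob)]
        last = hidden_size
    layers.append(nn.Linear(last, out_dim))
    return nn.Sequential(*layers)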