def test():
    # trained model directory
    directory = "./preTrained"
    filename = "ddpg"

    # initialize DDPG agent
    agent = DDPG(
        state_dim=env.observation_space.shape[0] - 1,
        # state_dim=env.observation_space.shape[0]
        action_dim=env.action_space.shape[0],
        action_bounds=env.action_space.high[0],
        lr=0)

    # load trained agent
    assert os.path.exists(directory), \
        "Trained model does not exist, try running train.py first."
    agent.load(directory, filename)

    for epoch in range(max_episode):
        # reset environment
        state = env.reset()
        done = False
        rewards = 0
        while not done:
            action = agent.select_action(state)
            # perform one step update on pendulum
            next_state, reward, done, _ = env.step(action)
            # go to next state
            state = next_state
            rewards += reward
            # render environment
            env.render()
        print("Episode:{:2d}, Rewards:{:.3f}".format(epoch, rewards))
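# Both test() above and train() further below read env and max_episode as
# module-level globals; a minimal sketch of that assumed setup follows.
# The environment name and episode budget are illustrative placeholders,
# not values taken from the original code.
import os

import gym
import numpy as np

env = gym.make("Pendulum-v0")   # assumed environment
max_episode = 100               # assumed episode budget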
def get_policy(arglist, kwargs, max_action):
    # Initialize policy
    if arglist.policy == "td3":
        # Target policy smoothing is scaled wrt the action scale
        kwargs["policy_noise"] = arglist.policy_noise * max_action
        kwargs["noise_clip"] = arglist.noise_clip * max_action
        kwargs["policy_freq"] = arglist.policy_freq
        policy = TD3.TD3(**kwargs)
    elif arglist.policy == "OurDDPG":
        policy = OurDDPG.DDPG(**kwargs)
    elif arglist.policy == "DDPG":
        policy = DDPG.DDPG(**kwargs)
    elif arglist.policy == 'adv':
        kwargs['alpha'] = arglist.alpha
        kwargs['adv_epsilon'] = arglist.adv_epsilon
        kwargs['logdir'] = f'./tensorboard/{arglist.policy}_{arglist.env}_{arglist.seed}/'
        policy = TD3_adv2.TD3(**kwargs)
    else:
        raise NotImplementedError
    return policy
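# A hedged sketch of how get_policy might be driven: the flag names mirror the
# arglist attributes read above, but the defaults, the kwargs contents, and the
# importability of the TD3/OurDDPG/DDPG/TD3_adv2 modules are assumptions.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--policy", default="td3")
parser.add_argument("--env", default="Pendulum-v0")
parser.add_argument("--seed", default=0, type=int)
parser.add_argument("--policy_noise", default=0.2, type=float)
parser.add_argument("--noise_clip", default=0.5, type=float)
parser.add_argument("--policy_freq", default=2, type=int)
parser.add_argument("--alpha", default=0.1, type=float)
parser.add_argument("--adv_epsilon", default=0.1, type=float)
arglist = parser.parse_args()

max_action = 1.0  # placeholder action bound
kwargs = {"state_dim": 3, "action_dim": 1, "max_action": max_action}  # placeholder dims
policy = get_policy(arglist, kwargs, max_action)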
def train(env_name, start_episodes, num_episodes, gamma, tau, noise_std,
          batch_size, eval_freq, seed):
    """
    Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        start_episodes: how many episodes the purely random policy is run for
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor
        tau: target network update rate
        batch_size: number of episodes per policy training batch
        eval_freq: number of training batches between evaluations
        seed: random seed for all modules with randomness
    """
    # set seeds
    set_global_seed(seed)
    # configure log
    configure_log_info(env_name, seed)

    # create env
    env = gym.make(env_name)
    env.seed(seed)  # set env seed
    obs_dim = env.observation_space.shape[0]
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    act_dim = env.action_space.shape[0]

    # create actor and target actor
    actor = Actor(obs_dim, act_dim, float(env.action_space.high[0])).to(device)
    target_actor = Actor(obs_dim, act_dim, float(env.action_space.high[0])).to(device)

    # create critic and target critic
    critic = Critic(obs_dim, act_dim).to(device)
    target_critic = Critic(obs_dim, act_dim).to(device)

    # create DDPG agent (hollowed object)
    agent = DDPG(actor, critic, target_actor, target_critic, noise_std, gamma, tau)
    agent.align_target()

    # create replay_buffer
    replay_buffer = ReplayBuffer()

    # run a few episodes of untrained policy to initialize scaler and fill in replay buffer
    run_policy(env, agent, replay_buffer, mode="random", episodes=start_episodes)

    num_iteration = num_episodes // eval_freq
    current_episodes = 0
    current_steps = 0
    for iter in range(num_iteration):
        # train models
        for i in range(eval_freq):
            # sample transitions
            train_returns, total_steps = run_policy(env, agent, replay_buffer,
                                                    mode="train", episodes=batch_size)
            current_episodes += batch_size
            current_steps += total_steps
            logger.info('[train] average return: {0}, std return: {1}'.format(
                np.mean(train_returns), np.std(train_returns)))
            # train
            num_epoch = total_steps // batch_size
            for e in range(num_epoch):
                observation, action, reward, next_obs, done = replay_buffer.sample()
                agent.update(observation, action, reward, next_obs, done)

        # test models
        num_test_episodes = 10
        returns, _ = run_policy(env, agent, replay_buffer, mode="test",
                                episodes=num_test_episodes)
        avg_return = np.mean(returns)
        std_return = np.std(returns)
        logger.record_tabular('iteration', iter)
        logger.record_tabular('episodes', current_episodes)
        logger.record_tabular('steps', current_steps)
        logger.record_tabular('avg_return', avg_return)
        logger.record_tabular('std_return', std_return)
        logger.dump_tabular()
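# The loop above assumes a ReplayBuffer whose sample() returns stacked
# (observation, action, reward, next_obs, done) arrays; a minimal
# uniform-sampling sketch under that assumption (the capacity and batch size
# defaults are placeholders, and the store() method name is hypothetical).
import random
from collections import deque

import numpy as np


class ReplayBuffer:
    def __init__(self, capacity=int(1e6), batch_size=64):
        self.buffer = deque(maxlen=capacity)
        self.batch_size = batch_size

    def store(self, obs, action, reward, next_obs, done):
        # append a single transition
        self.buffer.append((obs, action, reward, next_obs, done))

    def sample(self):
        # uniformly sample a batch and stack each field into an array
        batch = random.sample(self.buffer, min(self.batch_size, len(self.buffer)))
        obs, action, reward, next_obs, done = map(np.asarray, zip(*batch))
        return obs, action, reward, next_obs, done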
"state_dim": state_dim, "action_dim": action_dim, "max_action": max_action, "discount": args.discount, "tau": args.tau, } # Initialize policy if args.policy == "TD3": # Target policy smoothing is scaled wrt the action scale kwargs["policy_noise"] = args.policy_noise kwargs["noise_clip"] = args.noise_clip kwargs["policy_freq"] = args.policy_freq policy = TD3.TD3(**kwargs) elif args.policy == "DDPG": policy = DDPG.DDPG(**kwargs) if args.load_model != "": policy_file = file_name if args.load_model == "default" else args.load_model policy.load(f"./checkpoint/{policy_file}") replay_buffer = ReplayBuffer(state_dim, action_dim) # Evaluate untrained policy evaluations = [] # evaluations = [eval_policy(policy, env, args.seed, group_name)] # state, done = env.reset(group_name), False episode_reward = 0 episode_Rsim = 0 episode_Robs = 0
from models import DDPG

model = DDPG("BotGym-v0")
# model = DDPG("Pendulum-v0")
model.train(RENDER=True, MAX_EPISODES=10000000)
def train():
    # save trained model under preTrained directory
    directory = "./preTrained"
    filename = "ddpg"

    # set epsilon exploration rate and decay rate
    epsilon = 0.2
    eps_min = 1e-3
    eps_decay = 2e-3
    gaussian_exploration_noise = 0.2

    # set learning rate and batch size
    lr = 1e-3
    batch_size = 128

    # initialize replay memory
    replay_buffer = ReplayBuffer(max_size=5e4)

    # rewards for each episode / for plot
    rewards = np.zeros(max_episode)

    # initialize DDPG agent
    agent = DDPG(
        state_dim=env.observation_space.shape[0] - 1,
        # state_dim=env.observation_space.shape[0]
        action_dim=env.action_space.shape[0],
        action_bounds=env.action_space.high[0],
        lr=lr)

    for epoch in range(max_episode):
        # reset environment
        state = env.reset()
        done = False

        # epsilon decay
        epsilon = eps_min if (epsilon - eps_decay) < 0 else (epsilon - eps_decay)

        while not done:
            # epsilon-greedy: exploit with Gaussian noise, otherwise sample a random action
            if np.random.random_sample() > epsilon:
                action = agent.select_action(state)
                action = action + np.random.normal(0, gaussian_exploration_noise)
            else:
                action = np.array(
                    np.random.uniform(env.action_space.low[0],
                                      env.action_space.high[0])).reshape(1, )

            # perform one step update on pendulum
            next_state, reward, done, _ = env.step(action)
            env.render()
            replay_buffer.add((state, action, reward, next_state, done))

            # go to next state
            state = next_state
            # store rewards
            rewards[epoch] += reward

        # update the DDPG agent with n_iter batches sampled from the replay buffer
        agent.update(buffer=replay_buffer, n_iter=10, batch_size=batch_size)

        if rewards[epoch] > -1.0:
            print("task solved!\n")
            # save trained agent
            if not os.path.exists(directory):
                os.mkdir(directory)
            agent.save(directory, filename)

        # print rewards of current episode
        if epoch % 10 == 0:
            print('train epoch:', epoch, 'rewards:', rewards[epoch])

    return rewards
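# train() returns the per-episode reward array, so learning progress can be
# inspected by plotting it; a minimal sketch using matplotlib (the labels and
# title are arbitrary, and matplotlib is not required by the original code).
import matplotlib.pyplot as plt

episode_rewards = train()

plt.plot(episode_rewards)
plt.xlabel("episode")
plt.ylabel("cumulative reward")
plt.title("DDPG on Pendulum-v0")
plt.show()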