def __init__(self, env, gamma, tau, buffer_maxlen, critic_learning_rate, actor_learning_rate,
             max_action=1):
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    self.env = env
    self.obs_dim = env.observation_space.shape[0]
    self.action_dim = env.action_space.shape[0]
    self.noise = OUNoise(env.action_space)
    self.iter = 0.0
    self.noisy = False
    self.max_action = max_action
    print(self.action_dim)
    print(self.obs_dim)

    # RL hyperparameters
    self.gamma = gamma
    self.tau = tau

    # Initialize critic and actor networks
    self.critic = Critic(self.obs_dim, self.action_dim).to(self.device)
    self.critic_target = Critic(self.obs_dim, self.action_dim).to(self.device)

    self.actor = Actor(self.obs_dim, self.action_dim, self.max_action).to(self.device)
    # Use the same action bound for the target actor as for the online actor.
    self.actor_target = Actor(self.obs_dim, self.action_dim, self.max_action).to(self.device)

    # Copy network parameters into the critic target
    for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()):
        target_param.data.copy_(param.data)

    # Copy network parameters into the actor target as well, so both targets start in sync
    for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()):
        target_param.data.copy_(param.data)

    # Set optimization algorithms
    self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_learning_rate)
    self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_learning_rate)

    self.replay_buffer = ExperienceReplayLog(buffer_maxlen)
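# --- Hedged sketch (not part of the original source): the OUNoise class used in the
# constructor above is not defined in this section. A minimal Ornstein-Uhlenbeck noise
# process compatible with the `OUNoise(env.action_space)` call might look like this;
# the theta/sigma defaults and method names are assumptions.
import numpy as np


class OUNoise:
    def __init__(self, action_space, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.action_dim = action_space.shape[0]
        self.low = action_space.low
        self.high = action_space.high
        self.reset()

    def reset(self):
        # Re-centre the internal state at the mean.
        self.state = np.ones(self.action_dim) * self.mu

    def evolve_state(self):
        # dx = theta * (mu - x) + sigma * N(0, 1)
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(self.action_dim)
        self.state = self.state + dx
        return self.state

    def get_action(self, action):
        # Add exploration noise and clip to the environment's action bounds.
        return np.clip(action + self.evolve_state(), self.low, self.high)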
def train(env_id, *args, **kwargs):
    """Train a DDPG model on the given environment."""
    env = gym.make(env_id)
    [obs_dim] = env.observation_space.shape
    [act_dim] = env.action_space.shape

    # tf global variables are created here so they will be initialized by
    # @run_with_sess when we call train_with
    critic = Critic(obs_dim, act_dim)
    actor = Actor(obs_dim, act_dim, critic)
    noise = ornstein_uhlenbeck_noise(np.zeros(act_dim))

    return train_with(env, actor, critic, noise, *args, **kwargs)
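# --- Hedged usage sketch (not from the original source): one way the train() entry
# point above might be invoked. "Pendulum-v1" is an illustrative Gym environment id;
# any extra positional/keyword arguments are forwarded unchanged to train_with(),
# whose signature is defined elsewhere and is not shown in this section.
if __name__ == "__main__":
    train("Pendulum-v1")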
class DDPGAgent(Agent):
    """
    The DDPGAgent class implements a trainable DDPG agent.

    Parameters
    ----------
    logger: Logger
        The variable specifies a logger for model management, plotting and printing.
    obs_dim: int
        The variable specifies the dimension of the observation space vector.
    action_space: ndarray
        The variable specifies the action space of the environment.
    userconfig:
        The variable specifies the config settings.
    """

    def __init__(self, logger, obs_dim, action_space, userconfig):
        super().__init__(logger=logger,
                         obs_dim=obs_dim,
                         action_dim=action_space.shape[0],
                         userconfig=userconfig)
        self._observation_dim = obs_dim
        self._action_space = action_space
        self._action_n = action_space.shape[0]
        self._config = {
            "eps": 0.05,
            "discount": 0.95,
            "buffer_size": int(1e5),
            "batch_size": 128,
            "learning_rate_actor": 0.0002,
            "learning_rate_critic": 0.0002,
            "hidden_sizes": [256, 256],
            "tau": 0.0001
        }
        self._config.update(userconfig)
        self._eps = self._config['eps']
        self._tau = self._config['tau']
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.eval_mode = False

        if self._config['lr_milestones'] is None:
            raise ValueError(
                'lr_milestones argument cannot be None!\nExample: --lr_milestones=100 200 300')

        lr_milestones = [int(x) for x in (self._config['lr_milestones'][0]).split(' ')]

        # Critic
        self.critic = Critic(self._observation_dim,
                             self._action_n,
                             hidden_sizes=self._config['hidden_sizes'],
                             learning_rate=self._config['learning_rate_critic'],
                             lr_milestones=lr_milestones,
                             lr_factor=self._config['lr_factor'],
                             device=self._config['device'])
        self.critic_target = Critic(self._observation_dim,
                                    self._action_n,
                                    hidden_sizes=self._config['hidden_sizes'],
                                    learning_rate=self._config['learning_rate_critic'],
                                    lr_milestones=lr_milestones,
                                    lr_factor=self._config['lr_factor'],
                                    device=self._config['device'])

        # Actor
        self.actor = Actor(self._observation_dim,
                           self._action_n,
                           hidden_sizes=self._config['hidden_sizes'],
                           learning_rate=self._config['learning_rate_actor'],
                           lr_milestones=lr_milestones,
                           lr_factor=self._config['lr_factor'],
                           device=self._config['device'])
        self.actor_target = Actor(self._observation_dim,
                                  self._action_n,
                                  hidden_sizes=self._config['hidden_sizes'],
                                  learning_rate=self._config['learning_rate_actor'],
                                  lr_milestones=lr_milestones,
                                  lr_factor=self._config['lr_factor'],
                                  device=self._config['device'])

    def eval(self):
        self.eval_mode = True

    def train_mode(self):
        self.eval_mode = False

    def act(self, observation, eps=0, evaluation=False):
        state = torch.from_numpy(observation).float().to(self.device)
        if eps is None:
            eps = self._eps
        if np.random.random() > eps or evaluation:
            action = self.actor.forward(state)
            action = action.detach().cpu().numpy()[0]
        else:
            action = self._action_space.sample()[:4]
        return action

    def schedulers_step(self):
        self.critic.lr_scheduler.step()
        self.critic_target.lr_scheduler.step()
        self.actor.lr_scheduler.step()
        self.actor_target.lr_scheduler.step()

    def store_transition(self, transition):
        self.buffer.add_transition(transition)

    @staticmethod
    def load_model(fpath):
        with open(Path(fpath), 'rb') as inp:
            return pickle.load(inp)

    def train(self, total_step_counter, iter_fit=32):
        losses = []

        for i in range(iter_fit):
            data = self.buffer.sample(batch_size=self._config['batch_size'])
            s = torch.FloatTensor(np.stack(data[:, 0])).to(self.device)
            s_next = torch.FloatTensor(np.stack(data[:, 3])).to(self.device)
            a = torch.FloatTensor(np.stack(data[:, 1])[:, None]).squeeze(dim=1).to(self.device)
            rew = torch.FloatTensor(np.stack(data[:, 2])[:, None]).squeeze(dim=1).to(self.device)
            done = torch.FloatTensor(np.stack(data[:, 4])[:, None]).squeeze(dim=1).to(self.device)  # done flag

            # Current Q estimate for the sampled (s, a) pairs
            Q_target = self.critic(s, a).squeeze(dim=1).to(self.device)

            # Bootstrapped TD target from the target networks
            a_next = self.actor_target.forward(s_next)
            Q_next = self.critic_target.forward(s_next, a_next).squeeze(dim=1).to(self.device)
            targets = rew + self._config['discount'] * Q_next * (1.0 - done)

            # Optimize the critic
            targets = targets.to(self.device)
            critic_loss = self.critic.loss(Q_target.float(), targets.float())
            losses.append(critic_loss)
            self.critic.optimizer.zero_grad()
            critic_loss.backward()
            self.critic.optimizer.step()

            # Optimize the actor
            actions = self.actor.forward(s)
            actor_loss = -self.critic.forward(s, actions).mean()
            self.actor.optimizer.zero_grad()
            actor_loss.backward()
            self.actor.optimizer.step()

            # Soft-update the target networks
            if total_step_counter % self._config['update_target_every'] == 0:
                soft_update(self.critic_target, self.critic, self._tau)
                soft_update(self.actor_target, self.actor, self._tau)

        return losses
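# --- Hedged sketch (not part of the original source): the soft_update() helper called
# in train() above is not defined in this section. A common DDPG implementation is the
# Polyak average theta_target <- tau * theta_source + (1 - tau) * theta_target, with
# the same (target, source, tau) argument order used above:
import torch


def soft_update(target_net: torch.nn.Module, source_net: torch.nn.Module, tau: float) -> None:
    """Blend source parameters into the target network in place."""
    with torch.no_grad():
        for target_param, source_param in zip(target_net.parameters(), source_net.parameters()):
            target_param.data.copy_(tau * source_param.data + (1.0 - tau) * target_param.data)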
def retraining( save_path, network, env, seed=None, total_timesteps=None, nb_epochs=None, # with default settings, perform 1M steps total nb_epoch_cycles=4, #50 nb_rollout_steps=3, #100 reward_scale=1.0, render=False, render_eval=False, # noise_type='adaptive-param_0.2', noise_type='normal_0.2', # noise_type='ou_0.9', normalize_returns=False, normalize_observations=True, critic_l2_reg=1e-2, actor_lr=1e-4, critic_lr=1e-4, # actor_lr=1e-6, # critic_lr=1e-5, popart=False, gamma=0.99, clip_norm=None, nb_train_steps=3, # per epoch cycle and MPI worker, 50 nb_eval_steps=1, #100 batch_size=640, # per MPI worker tau=0.01, eval_env=None, param_noise_adaption_interval=3, #50 **network_kwargs): if total_timesteps is not None: assert nb_epochs is None nb_epochs = int(total_timesteps) // (nb_epoch_cycles * nb_rollout_steps) else: nb_epochs = 500 rank = MPI.COMM_WORLD.Get_rank() # nb_actions = env.action_space.shape[-1] nb_actions = env.num_actions # nb_actions=3 # print(nb_actions) action_shape = np.array(nb_actions * [0]).shape #4 pairs pos + 3 link length # nb_features = 2*(env.num_actions+1)+env.num_actions #4 pairs pos + 1 pair target pos nb_features = 2 * (env.num_actions + 2) observation_shape = np.array(nb_features * [0]).shape # assert (np.abs(env.action_space.low) == env.action_space.high).all() # we assume symmetric actions. # memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape) memory = Memory(limit=int(1e6), action_shape=action_shape, observation_shape=observation_shape) critic = Critic(network=network, **network_kwargs) actor = Actor(nb_actions, network=network, **network_kwargs) action_noise = None param_noise = None # nb_actions = env.action_space.shape[-1] if noise_type is not None: for current_noise_type in noise_type.split(','): current_noise_type = current_noise_type.strip() if current_noise_type == 'none': pass elif 'adaptive-param' in current_noise_type: _, stddev = current_noise_type.split('_') param_noise = AdaptiveParamNoiseSpec( initial_stddev=float(stddev), desired_action_stddev=float(stddev)) elif 'normal' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) elif 'ou' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = OrnsteinUhlenbeckActionNoise( mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) else: raise RuntimeError( 'unknown noise type "{}"'.format(current_noise_type)) # agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, agent = DDPG(actor, critic, memory, observation_shape, action_shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) sess = U.get_session() # Prepare everything. 
agent.initialize(sess) # sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() nenvs = obs.shape[0] episode_reward = np.zeros(nenvs, dtype=np.float32) #vector episode_step = np.zeros(nenvs, dtype=int) # vector episodes = 0 #scalar t = 0 # scalar step_set = [] reward_set = [] epoch = 0 start_time = time.time() epoch_episode_rewards = [] mean_epoch_episode_rewards = [] epoch_episode_steps = [] epoch_actions = [] epoch_qs = [] epoch_episodes = 0 #load the initialization policy agent.load_ini(sess, save_path) # agent.memory.clear(limit=int(1e6), action_shape=action_shape, observation_shape=observation_shape) for epoch in range(nb_epochs): print(nb_epochs) # obs, env_state = env.reset() obs = env.reset() agent.save(save_path) epoch_episode_rewards = [] '''check if the actor initialization policy has been loaded correctly, i.e. equal to directly ouput values in checkpoint files ''' # loaded_weights=tf.get_default_graph().get_tensor_by_name('target_actor/mlp_fc0/w:0') # print('loaded_weights:', sess.run(loaded_weights)) for cycle in range(nb_epoch_cycles): # Perform rollouts. for t_rollout in range(nb_rollout_steps): # Predict next action action, q, _, _ = agent.step(obs, apply_noise=True, compute_Q=True) print('action:', action) new_obs, r, done = env.step(action) # time.sleep(0.2) t += 1 episode_reward += r episode_step += 1 # print('episode_re: ', episode_reward) #[1.] # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) b = 1. agent.store_transition( obs, action, r, new_obs, done ) #the batched data will be unrolled in memory.py's append. obs = new_obs epoch_episode_rewards.append(episode_reward) episode_reward = np.zeros(nenvs, dtype=np.float32) #vector # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) # print('Train!') cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: nenvs_eval = eval_obs.shape[0] eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32) for t_rollout in range(nb_eval_steps): eval_action, eval_q, _, _ = agent.step(eval_obs, apply_noise=False, compute_Q=True) # eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) eval_obs, eval_r, eval_done, eval_info = eval_env.step( eval_action) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) for d in range(len(eval_done)): if eval_done[d]: eval_episode_rewards.append(eval_episode_reward[d]) eval_episode_rewards_history.append( eval_episode_reward[d]) eval_episode_reward[d] = 0.0 mpi_size = MPI.COMM_WORLD.Get_size() # Log stats. 
# XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = agent.get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean( episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps) combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = np.mean( epoch_adaptive_distances) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float(duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std(epoch_actions) mean_epoch_episode_rewards.append(np.mean(epoch_episode_rewards)) # print(step_set,mean_epoch_episode_rewards) step_set.append(t) plt.plot(step_set, mean_epoch_episode_rewards, color='r', label='Initialization') plt.xlabel('Steps') plt.ylabel('Mean Episode Reward') plt.savefig('ddpg_mean_retrain.png') # plt.show() # Evaluation statistics. if eval_env is not None: combined_stats['eval/return'] = eval_episode_rewards combined_stats['eval/return_history'] = np.mean( eval_episode_rewards_history) combined_stats['eval/Q'] = eval_qs combined_stats['eval/episodes'] = len(eval_episode_rewards) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s' % x) combined_stats_sums = MPI.COMM_WORLD.allreduce( np.array( [np.array(x).flatten()[0] for x in combined_stats.values()])) combined_stats = { k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums) } # Total statistics. combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) if rank == 0: logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f) print('stepset: ', step_set) print('rewards: ', mean_epoch_episode_rewards) return agent
class DDPGAgent(object):
    """Interacts with and learns from the environment."""

    def __init__(self, id, state_size, action_size, seed, memory, num_agents,
                 hyperparameters: Mapping[str, float]):
        """Initialize a DDPG agent object.

        Params
        ======
            id (int): agent's id
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            memory (ReplayBuffer): replay buffer to store the experience of this agent
            hyperparameters (dict of str: float): hyperparameter values of the model.
                The expected parameters are:
                - batch_size (int): minibatch size
                - lr_actor (float): learning rate of the actor
                - lr_critic (float): learning rate of the critic
                - gamma (float): discount factor
                - weight_decay (float): critic L2 weight decay
                - tau (float): value for soft update of target parameters
                - update_frequency (int): how many steps to execute between learning updates
                - n_learns (int): how many learning passes to run per update
        """
        self.id = id
        self.__name__ = 'DDPG'
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = hyperparameters['gamma']
        self.batch_size = int(hyperparameters['batch_size'])
        self.tau = hyperparameters['tau']
        self.update_frequency = int(hyperparameters['update_frequency'])
        self.n_learns = int(hyperparameters['n_learns'])

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=hyperparameters['lr_actor'])

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, num_agents, seed).to(device)
        self.critic_target = Critic(state_size, action_size, num_agents, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=hyperparameters['lr_critic'],
                                           weight_decay=hyperparameters['weight_decay'])

        # Noise process
        self.noise = Ornstein(action_size)

        # Replay memory
        self.memory = memory

        # Initialize the time step (counts every update_frequency steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done, other_states, other_actions,
             other_next_states):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        self.memory.add(state, action, reward, next_state, done, other_states, other_actions,
                        other_next_states)

        self.t_step = (self.t_step + 1) % self.update_frequency
        if self.t_step == 0:
            # Learn, if enough samples are available in memory
            for _ in range(self.n_learns):
                if len(self.memory) > self.batch_size:
                    experiences = self.memory.sample(self.batch_size)
                    self.learn(experiences, self.gamma)

    def act(self, states, add_noise=True):
        """Returns actions for the given states as per the current policy."""
        states = torch.from_numpy(states).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, _, _, _, _, other_states, _, _ = experiences
        self.update_critic(experiences, gamma)
        self.update_actor(states, other_states)
        self.update_target_networks()

    def update_critic(self, experiences, gamma):
        """Update the critic network given the experiences."""
        (states, actions, rewards, next_states, dones,
         other_states, other_actions, other_next_states) = experiences
        all_states = torch.cat((states, other_states), dim=1).to(device)
        all_actions = torch.cat((actions, other_actions), dim=1).to(device)
        all_next_states = torch.cat((next_states, other_next_states), dim=1).to(device)

        local_all_next_actions = []
        local_all_next_actions.append(self.actor_target(states))
        local_all_next_actions.append(self.actor_target(other_states))
        all_next_actions = torch.cat(local_all_next_actions, dim=1).to(device)

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        Q_targets_next = self.critic_target(all_next_states, all_next_actions)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(all_states, all_actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

    def update_actor(self, states, other_states):
        all_states = torch.cat((states, other_states), dim=1).to(device)

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        other_actions_pred = self.actor_local(other_states)
        other_actions_pred = other_actions_pred.detach()
        actions_pred = torch.cat((actions_pred, other_actions_pred), dim=1).to(device)
        actor_loss = -self.critic_local(all_states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

    def update_target_networks(self):
        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
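# --- Hedged sketch (not part of the original source): the ReplayBuffer object passed
# into DDPGAgent above is not defined in this section. The field names, tuple order and
# the module-level `device` below are assumptions inferred from how step() stores
# transitions and how learn()/update_critic() unpack their experiences.
import random
from collections import deque, namedtuple

import numpy as np
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Experience = namedtuple("Experience", [
    "state", "action", "reward", "next_state", "done",
    "other_states", "other_actions", "other_next_states"
])


class ReplayBuffer:
    """Fixed-size buffer that stores one agent's experience tuples."""

    def __init__(self, buffer_size: int, seed: int = 0):
        self.memory = deque(maxlen=buffer_size)
        random.seed(seed)

    def add(self, *fields):
        self.memory.append(Experience(*fields))

    def sample(self, batch_size: int):
        batch = random.sample(self.memory, k=batch_size)

        def stack(name):
            # Stack one field across the batch and move it to the training device.
            return torch.from_numpy(
                np.vstack([getattr(e, name) for e in batch]).astype(np.float32)).to(device)

        return tuple(stack(name) for name in Experience._fields)

    def __len__(self):
        return len(self.memory)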
def testing( save_path, network, env, seed=None, total_timesteps=None, nb_epochs=None, # with default settings, perform 1M steps total nb_epoch_cycles=50, nb_rollout_steps=3, reward_scale=1.0, render=False, render_eval=False, # no noise for test # noise_type='adaptive-param_0.2', # noise_type='normal_0.9', # noise_type='ou_0.9', normalize_returns=False, normalize_observations=True, critic_l2_reg=1e-2, actor_lr=1e-4, critic_lr=1e-3, # actor_lr=1e-6, # critic_lr=1e-5, popart=False, gamma=0.99, clip_norm=None, nb_train_steps=3, # per epoch cycle and MPI worker, 50 nb_eval_steps=1, batch_size=64, # per MPI worker tau=0.01, eval_env=None, param_noise_adaption_interval=3, # **network_kwargs): if total_timesteps is not None: assert nb_epochs is None nb_epochs = int(total_timesteps) // (nb_epoch_cycles * nb_rollout_steps) else: nb_epochs = 500 rank = MPI.COMM_WORLD.Get_rank() # nb_actions = env.action_space.shape[-1] # nb_actions = 2*env.grid_size nb_actions = env.grid_size action_shape = np.array(nb_actions * [0]).shape nb_features = (4 + 1) * env.grid_size observation_shape = np.array(nb_features * [0]).shape grid_x = env.grid_x grid_y = env.grid_y x = [] y = [] for i in range(grid_x): x.append(i + 1) for i in range(grid_y): y.append(i + 1) # assert (np.abs(env.action_space.low) == env.action_space.high).all() # we assume symmetric actions. # memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape) memory = Memory(limit=int(1e6), action_shape=action_shape, observation_shape=observation_shape) critic = Critic(network=network, **network_kwargs) actor = Actor(nb_actions, network=network, **network_kwargs) action_noise = None param_noise = None '''no noise for test''' # if noise_type is not None: # for current_noise_type in noise_type.split(','): # current_noise_type = current_noise_type.strip() # if current_noise_type == 'none': # pass # elif 'adaptive-param' in current_noise_type: # _, stddev = current_noise_type.split('_') # param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev)) # elif 'normal' in current_noise_type: # _, stddev = current_noise_type.split('_') # action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) # elif 'ou' in current_noise_type: # _, stddev = current_noise_type.split('_') # action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) # else: # raise RuntimeError('unknown noise type "{}"'.format(current_noise_type)) # max_action = env.action_space.high # logger.info('scaling actions by {} before executing in env'.format(max_action)) # agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, agent = DDPG(actor, critic, memory, observation_shape, action_shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) sess = U.get_session() # Prepare everything. 
# agent.initialize(sess) # sess.graph.finalize() agent.load(sess, save_path) agent.reset() obs, env_state = env.reset() if eval_env is not None: eval_obs = eval_env.reset() nenvs = obs.shape[0] episode_reward = np.zeros(nenvs, dtype=np.float32) #vector episode_step = np.zeros(nenvs, dtype=int) # vector episodes = 0 #scalar t = 0 # scalar step_set = [] reward_set = [] epoch = 0 start_time = time.time() epoch_episode_rewards = [] average_reward = [] mean_epoch_episode_rewards = [] epoch_episode_steps = [] epoch_actions = [] epoch_qs = [] epoch_state = [] epoch_episodes = 0 #record the car numbers in each step car_num_set = {} t_set = [i for i in range(total_timesteps)] for xx in x: for yy in y: lab = str(xx) + str(yy) car_num_set[lab] = [[0 for i in range(total_timesteps)] for j in range(4)] for epoch in range(nb_epochs): obs, env_state = env.reset() epoch_actions = [] epoch_state = [] average_car_num_set = [] last_action = 1 for cycle in range(nb_epoch_cycles): # Perform rollouts. action, q, _, _ = agent.step(obs, apply_noise=False, compute_Q=True) '''random action''' # if np.random.rand()>0.5: # action=[1] # else: # action=[0] '''cycle light state''' # action=[0] '''cycle action (should cycle state instead of action)''' # if last_action==1: # action=[0] # else: # action=[1] # last_action=action[0] if nenvs > 1: # if simulating multiple envs in parallel, impossible to reset agent at the end of the episode in each # of the environments, so resetting here instead agent.reset() for t_rollout in range(nb_rollout_steps): new_obs, r, env_state, done = env.step(action, env_state) epoch_state.append(env_state['11'].light_state) for xx in x: for yy in y: lab = str(xx) + str(yy) for i in range(4): car_num_set[lab][i][t] = ( env_state['11'].car_nums[i]) t += 1 episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) b = 1. agent.store_transition( obs, action, r, new_obs, done ) #the batched data will be unrolled in memory.py's append. obs = new_obs for d in range(len(done)): if done[d]: print('done') # Episode done. epoch_episode_rewards.append(episode_reward[d]) episode_rewards_history.append(episode_reward[d]) epoch_episode_steps.append(episode_step[d]) episode_reward[d] = 0. episode_step[d] = 0 epoch_episodes += 1 episodes += 1 if nenvs == 1: agent.reset() epoch_episode_rewards.append(episode_reward) average_reward.append(episode_reward / nb_rollout_steps) episode_reward = np.zeros(nenvs, dtype=np.float32) #vector # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] # for t_train in range(nb_train_steps): # # Adapt param noise, if necessary. # if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: # distance = agent.adapt_param_noise() # epoch_adaptive_distances.append(distance) # # print('Train!') # cl, al = agent.train() # epoch_critic_losses.append(cl) # epoch_actor_losses.append(al) # agent.update_target_net() # Evaluate. 
eval_episode_rewards = [] eval_qs = [] if eval_env is not None: nenvs_eval = eval_obs.shape[0] eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32) for t_rollout in range(nb_eval_steps): eval_action, eval_q, _, _ = agent.step(eval_obs, apply_noise=False, compute_Q=True) # eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) eval_obs, eval_r, eval_done, eval_info = eval_env.step( eval_action) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) for d in range(len(eval_done)): if eval_done[d]: eval_episode_rewards.append(eval_episode_reward[d]) eval_episode_rewards_history.append( eval_episode_reward[d]) eval_episode_reward[d] = 0.0 step_set.append(t) mpi_size = MPI.COMM_WORLD.Get_size() # Log stats. # XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = agent.get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean( episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps) combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = np.mean( epoch_adaptive_distances) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float(duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std(epoch_actions) mean_epoch_episode_rewards.append(np.mean(epoch_episode_rewards)) # print(step_set,mean_epoch_episode_rewards) # plt.figure(figsize=(8,5)) '''plot rewards-steps''' ax1 = plt.subplot(2, 1, 1) plt.sca(ax1) plt.plot(step_set, average_reward, color='b') # plt.xlabel('Steps') plt.ylabel('Mean Reward', fontsize=12) # plt.ylim(-15000,0) '''plot queueing car numbers-steps''' ax2 = plt.subplot(2, 1, 2) plt.sca(ax2) print(np.shape(t_set), np.shape(car_num_set['11'][i])) for i in range(4): if i == 0: plt.plot(t_set, car_num_set['11'][i], '--', label=i, color='b') elif i == 1: plt.plot(t_set, car_num_set['11'][i], '--', label=i, color='orange') elif i == 2: plt.plot(t_set, car_num_set['11'][i], label=i, color='g') else: plt.plot(t_set, car_num_set['11'][i], label=i, color='r') plt.ylim(0, 100) #sum among roads sum_car_num = np.sum(car_num_set['11'], axis=0) #average among time steps average_car_num = np.average(sum_car_num) average_car_num_set.append(average_car_num) plt.xlabel('Steps', fontsize=12) plt.ylabel('Cars Numbers', fontsize=12) # set legend handles, labels = plt.gca().get_legend_handles_labels() by_label = OrderedDict(zip(labels, handles)) leg = plt.legend(by_label.values(), by_label.keys(), loc=1) # leg = plt.legend(loc=4) legfm = leg.get_frame() legfm.set_edgecolor('black') # set legend fame color legfm.set_linewidth(0.5) # set legend fame linewidth plt.savefig('ddpg_mean_test.pdf') plt.show() print(epoch_state) # Evaluation statistics. 
if eval_env is not None: combined_stats['eval/return'] = eval_episode_rewards combined_stats['eval/return_history'] = np.mean( eval_episode_rewards_history) combined_stats['eval/Q'] = eval_qs combined_stats['eval/episodes'] = len(eval_episode_rewards) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s' % x) combined_stats_sums = MPI.COMM_WORLD.allreduce( np.array( [np.array(x).flatten()[0] for x in combined_stats.values()])) combined_stats = { k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums) } # Total statistics. combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) if rank == 0: logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f) print('average queueing car numbers: ', np.average(average_car_num_set)) return agent
def testing(save_path, network, env, seed=None, total_timesteps=None, nb_epochs=None, # with default settings, perform 1M steps total nb_epoch_cycles=50, nb_rollout_steps=3, #100 reward_scale=1.0, render=False, render_eval=False, # no noise for test # noise_type='adaptive-param_0.2', # noise_type='normal_0.9', # noise_type='ou_0.9', normalize_returns=False, normalize_observations=True, critic_l2_reg=1e-2, actor_lr=1e-4, critic_lr=1e-3, # actor_lr=1e-6, # critic_lr=1e-5, popart=False, gamma=0.99, clip_norm=None, nb_train_steps=3, # per epoch cycle and MPI worker, 50 nb_eval_steps=1, #100 batch_size=640, # per MPI worker tau=0.01, eval_env=None, param_noise_adaption_interval=3, #50 **network_kwargs): if total_timesteps is not None: assert nb_epochs is None nb_epochs = int(total_timesteps) // (nb_epoch_cycles * nb_rollout_steps) else: nb_epochs = 500 rank = MPI.COMM_WORLD.Get_rank() # nb_actions = env.action_space.shape[-1] nb_actions = env.num_actions # nb_actions=3 # print(nb_actions) action_shape=np.array(nb_actions*[0]).shape nb_features = 2*(env.num_actions+1)+env.num_actions observation_shape=np.array(nb_features*[0]).shape # assert (np.abs(env.action_space.low) == env.action_space.high).all() # we assume symmetric actions. # memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape) memory = Memory(limit=int(1e6), action_shape=action_shape, observation_shape=observation_shape) critic = Critic(network=network, **network_kwargs) actor = Actor(nb_actions, network=network, **network_kwargs) action_noise = None param_noise = None # nb_actions = env.action_space.shape[-1] '''no noise for test''' # if noise_type is not None: # for current_noise_type in noise_type.split(','): # current_noise_type = current_noise_type.strip() # if current_noise_type == 'none': # pass # elif 'adaptive-param' in current_noise_type: # _, stddev = current_noise_type.split('_') # param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev)) # elif 'normal' in current_noise_type: # _, stddev = current_noise_type.split('_') # action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) # elif 'ou' in current_noise_type: # _, stddev = current_noise_type.split('_') # action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) # else: # raise RuntimeError('unknown noise type "{}"'.format(current_noise_type)) # max_action = env.action_space.high # logger.info('scaling actions by {} before executing in env'.format(max_action)) # agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, agent = DDPG(actor, critic, memory, observation_shape, action_shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) sess = U.get_session() # Prepare everything. agent.load(sess,save_path) # sess.graph.finalize() # cannot save sess if its finalized! 
agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() nenvs = obs.shape[0] episode_reward = np.zeros(nenvs, dtype = np.float32) #vector episode_step = np.zeros(nenvs, dtype = int) # vector episodes = 0 #scalar t = 0 # scalar step_set=[] reward_set=[] epoch = 0 start_time = time.time() epoch_episode_rewards = [] mean_epoch_episode_rewards = [] epoch_episode_steps = [] epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(nb_epochs): print(nb_epochs) # obs, env_state = env.reset() obs = env.reset() for cycle in range(nb_epoch_cycles): # Perform rollouts. if nenvs > 1: # if simulating multiple envs in parallel, impossible to reset agent at the end of the episode in each # of the environments, so resetting here instead agent.reset() for t_rollout in range(nb_rollout_steps): # Predict next action. '''no noise for test''' action, q, _, _ = agent.step(obs, apply_noise=False, compute_Q=True) # print('action:', action) # Execute next action. # if rank == 0 and render: # env.render() # max_action is of dimension A, whereas action is dimension (nenvs, A) - the multiplication gets broadcasted to the batch # new_obs, r, done, info = env.step(max_action * action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) # new_obs, r, env_state,done = env.step(action, env_state) '''actually no need for env_state: in or out''' new_obs, r, done = env.step(action) # print('reward:', r) # note these outputs are batched from vecenv # print('obs: ',obs.shape,obs, 'action: ', action.shape, action ) '''obs shape: (1,17), action shape: (1,6)''' # print('maxaction: ', max_action.shape) '''max_action shape: (6,) , max_action*action shape: (1,6)''' t += 1 # if rank == 0 and render: # env.render() # print('r:', r) episode_reward += r episode_step += 1 # print('episode_re: ', episode_reward) #[1.] # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) b=1. agent.store_transition(obs, action, r, new_obs, done) #the batched data will be unrolled in memory.py's append. # print('r: ', r) # '''r shape: (1,)''' obs = new_obs # for d in range(len(done)): # if done[d]: # print('done') # # Episode done. # epoch_episode_rewards.append(episode_reward[d]) # episode_rewards_history.append(episode_reward[d]) # epoch_episode_steps.append(episode_step[d]) # episode_reward[d] = 0. # episode_step[d] = 0 # epoch_episodes += 1 # episodes += 1 # if nenvs == 1: # agent.reset() '''added''' epoch_episode_rewards.append(episode_reward) ''' step_set.append(t) reward_set=np.concatenate((reward_set,episode_reward)) # print(step_set,reward_set) # print(t, episode_reward) plt.plot(step_set,reward_set) plt.xlabel('Steps') plt.ylabel('Episode Reward') plt.savefig('ddpg.png') plt.show() ''' episode_reward = np.zeros(nenvs, dtype = np.float32) #vector # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] '''no training for test''' # for t_train in range(nb_train_steps): # Adapt param noise, if necessary. no noise for test! # if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: # distance = agent.adapt_param_noise() # epoch_adaptive_distances.append(distance) # cl, al = agent.train() # epoch_critic_losses.append(cl) # epoch_actor_losses.append(al) # agent.update_target_net() # Evaluate. 
eval_episode_rewards = [] eval_qs = [] if eval_env is not None: nenvs_eval = eval_obs.shape[0] eval_episode_reward = np.zeros(nenvs_eval, dtype = np.float32) for t_rollout in range(nb_eval_steps): eval_action, eval_q, _, _ = agent.step(eval_obs, apply_noise=False, compute_Q=True) # eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) eval_obs, eval_r, eval_done, eval_info = eval_env.step( eval_action) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) for d in range(len(eval_done)): if eval_done[d]: eval_episode_rewards.append(eval_episode_reward[d]) eval_episode_rewards_history.append(eval_episode_reward[d]) eval_episode_reward[d] = 0.0 mpi_size = MPI.COMM_WORLD.Get_size() # Log stats. # XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = agent.get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean(episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps) combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float(duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std(epoch_actions) mean_epoch_episode_rewards.append(np.mean(epoch_episode_rewards)) # print(step_set,mean_epoch_episode_rewards) step_set.append(t) plt.plot(step_set,mean_epoch_episode_rewards) plt.xlabel('Steps') plt.ylabel('Mean Episode Reward') plt.savefig('ddpg_mean_test.png') # plt.show() # Evaluation statistics. if eval_env is not None: combined_stats['eval/return'] = eval_episode_rewards combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history) combined_stats['eval/Q'] = eval_qs combined_stats['eval/episodes'] = len(eval_episode_rewards) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s'%x) combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([ np.array(x).flatten()[0] for x in combined_stats.values()])) combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)} # Total statistics. combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) if rank == 0: logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f) return agent
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs): # Configure things. rank = MPI.COMM_WORLD.Get_rank() if rank != 0: logger.set_level(logger.DISABLED) # Create envs. env = gym.make(env_id) env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank))) if evaluation and rank == 0: eval_env = gym.make(env_id) eval_env = Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval')) env = Monitor(env, None) else: eval_env = None # Parse noise_type action_noise = None param_noise = None nb_actions = env.action_space.shape[-1] for current_noise_type in noise_type.split(','): current_noise_type = current_noise_type.strip() if current_noise_type == 'none': pass elif 'adaptive-param' in current_noise_type: _, stddev = current_noise_type.split('_') param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev)) elif 'normal' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) elif 'ou' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) else: raise RuntimeError('unknown noise type "{}"'.format(current_noise_type)) # Configure components. memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape) critic = Critic(layer_norm=layer_norm) actor = Actor(nb_actions, layer_norm=layer_norm) # Seed everything to make things reproducible. seed = seed + 1000000 * rank logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir())) tf.reset_default_graph() set_global_seeds(seed) env.seed(seed) if eval_env is not None: eval_env.seed(seed) # Disable logging for rank != 0 to avoid noise. if rank == 0: start_time = time.time() training.train(env=env, eval_env=eval_env, param_noise=param_noise, action_noise=action_noise, actor=actor, critic=critic, memory=memory, **kwargs) env.close() if eval_env is not None: eval_env.close() if rank == 0: logger.info('total runtime: {}s'.format(time.time() - start_time))
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs): # Configure things. rank = MPI.COMM_WORLD.Get_rank() if rank != 0: logger.set_level(logger.DISABLED) # Create envs. env = gym.make(env_id) # ---------- AMEND: specific setting for brsEngine ----------- print("kwargs", kwargs) env.reward_type = kwargs['reward_type'] env.set_additional_goal = kwargs['set_additional_goal'] kwargs.pop('reward_type', None) kwargs.pop('set_additional_goal', None) brsEngine = None if env.reward_type == 'ttr': if env_id == 'DubinsCarEnv-v0': brsEngine = DubinsCar_brs_engine() brsEngine.reset_variables() elif env_id == 'PlanarQuadEnv-v0': brsEngine = Quadrotor_brs_engine() brsEngine.reset_variables() else: raise ValueError("invalid environment name for ttr reward!") # You have to assign the engine! env.brsEngine = brsEngine # ----------------------------------------------------------- env = bench.Monitor( env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank))) if evaluation and rank == 0: eval_env = gym.make(env_id) # ---------- AMEND: specific setting for brsEngine ----------- eval_env.brsEngine = brsEngine # ------------------------------------------------------------ eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval')) env = bench.Monitor(env, None) else: eval_env = None # Parse noise_type action_noise = None param_noise = None nb_actions = env.action_space.shape[-1] for current_noise_type in noise_type.split(','): current_noise_type = current_noise_type.strip() if current_noise_type == 'none': pass elif 'adaptive-param' in current_noise_type: _, stddev = current_noise_type.split('_') param_noise = AdaptiveParamNoiseSpec( initial_stddev=float(stddev), desired_action_stddev=float(stddev)) elif 'normal' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) elif 'ou' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = OrnsteinUhlenbeckActionNoise( mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) else: raise RuntimeError( 'unknown noise type "{}"'.format(current_noise_type)) # Configure components. memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape) critic = Critic(layer_norm=layer_norm) actor = Actor(nb_actions, layer_norm=layer_norm) # Seed everything to make things reproducible. seed = seed + 1000000 * rank logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir())) tf.reset_default_graph() set_global_seeds(seed) env.seed(seed) if eval_env is not None: eval_env.seed(seed) # Disable logging for rank != 0 to avoid noise. if rank == 0: start_time = time.time() training.train(env=env, eval_env=eval_env, param_noise=param_noise, action_noise=action_noise, actor=actor, critic=critic, memory=memory, **kwargs) env.close() if eval_env is not None: eval_env.close() if rank == 0: logger.info('total runtime: {}s'.format(time.time() - start_time))
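# --- Hedged usage note (assumption, not from the original source): in both run()
# variants above, noise_type is a comma-separated list of "<kind>_<stddev>" tokens,
# e.g. "none", "adaptive-param_0.2", "normal_0.1", "ou_0.2", or a combination such as
# "adaptive-param_0.2,normal_0.1". An illustrative call for the amended variant, with
# placeholder values for the remaining keyword arguments:
#
#     run(env_id='DubinsCarEnv-v0', seed=0, noise_type='ou_0.2', layer_norm=True,
#         evaluation=False, reward_type='ttr', set_additional_goal='angle')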
def __init__(self, observation_shape, action_shape, nb_demo_kine, nb_key_states, batch_size=128, noise_type='', actor=None, critic=None, layer_norm=True, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf), normalize_returns=False, normalize_observations=True, reward_scale=1., clip_norm=None, demo_l2_reg=0., critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, demo_lr=5e-3, gamma=0.99, tau=0.001, enable_popart=False, save_ckpt=True): # Noise nb_actions = action_shape[-1] param_noise, action_noise = process_noise_type(noise_type, nb_actions) logger.info('param_noise', param_noise) logger.info('action_noise', action_noise) # States recording self.memory = Memory(limit=int(2e5), action_shape=action_shape, observation_shape=observation_shape) # Models self.nb_demo_kine = nb_demo_kine self.actor = actor or Actor( nb_actions, nb_demo_kine, layer_norm=layer_norm) self.nb_key_states = nb_key_states self.critic = critic or Critic(nb_key_states, layer_norm=layer_norm) self.nb_obs_org = nb_key_states # Inputs. self.obs0 = tf.placeholder(tf.float32, shape=(None, ) + observation_shape, name='obs0') self.obs1 = tf.placeholder(tf.float32, shape=(None, ) + observation_shape, name='obs1') self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions = tf.placeholder(tf.float32, shape=(None, ) + action_shape, name='actions') # self.critic_target_Q: value assigned by self.target_Q_obs0 self.critic_target_Q = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target_Q') self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev') # change in observations self.obs_delta_kine = (self.obs1 - self.obs0)[:, :self.nb_demo_kine] self.obs_delta_kstates = (self.obs1 - self.obs0)[:, :self.nb_key_states] # Parameters. self.gamma = gamma self.tau = tau self.normalize_observations = normalize_observations self.normalize_returns = normalize_returns self.action_noise = action_noise self.param_noise = param_noise self.action_range = action_range self.return_range = return_range self.observation_range = observation_range self.actor_lr = actor_lr self.critic_lr = critic_lr self.demo_lr = demo_lr self.clip_norm = clip_norm self.enable_popart = enable_popart self.reward_scale = reward_scale self.batch_size = batch_size self.stats_sample = None self.critic_l2_reg = critic_l2_reg self.demo_l2_reg = demo_l2_reg # Observation normalization. if self.normalize_observations: with tf.variable_scope('obs_rms'): self.obs_rms = RunningMeanStd(shape=observation_shape) else: self.obs_rms = None self.normalized_obs0 = tf.clip_by_value( obs_norm_partial(self.obs0, self.obs_rms, self.nb_obs_org), self.observation_range[0], self.observation_range[1]) normalized_obs1 = tf.clip_by_value( obs_norm_partial(self.obs1, self.obs_rms, self.nb_obs_org), self.observation_range[0], self.observation_range[1]) # Return normalization. if self.normalize_returns: with tf.variable_scope('ret_rms'): self.ret_rms = RunningMeanStd() else: self.ret_rms = None # Create target networks. target_actor = copy(self.actor) target_actor.name = 'target_actor' self.target_actor = target_actor target_critic = copy(self.critic) target_critic.name = 'target_critic' self.target_critic = target_critic # Create networks and core TF parts that are shared across set-up parts. 
# the actor output is [0,1], need to normalised to [-1,1] before feeding into critic self.actor_tf, self.demo_aprx = self.actor(self.normalized_obs0) # critic loss # normalized_critic_tf, pred_rwd, pred_obs_delta: critic_loss self.normalized_critic_tf, self.pred_rwd, self.pred_obs_delta = self.critic( self.normalized_obs0, act_norm(self.actions)) # self.critic_tf: only in logging [reference_Q_mean/std] self.critic_tf = ret_denormalize( tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) # actor loss normalized_critic_with_actor_tf = self.critic(self.normalized_obs0, act_norm(self.actor_tf), reuse=True)[0] # self.critic_with_actor_tf: actor loss, and logging [reference_Q_tf_mean/std] self.critic_with_actor_tf = ret_denormalize( tf.clip_by_value(normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) # target Q self.target_action = tf.clip_by_value( target_actor(normalized_obs1)[0], self.action_range[0], self.action_range[1]) self.target_Q_obs1 = ret_denormalize( target_critic(normalized_obs1, act_norm(self.target_action))[0], self.ret_rms) self.target_Q_obs0 = self.rewards + ( 1. - self.terminals1) * gamma * self.target_Q_obs1 # Set up parts. if self.param_noise is not None: self.setup_param_noise(self.normalized_obs0) self.setup_actor_optimizer() self.setup_critic_optimizer() if self.normalize_returns and self.enable_popart: self.setup_popart() self.setup_stats() self.setup_target_network_updates() self.dbg_vars = self.actor.dbg_vars + self.critic.dbg_vars self.sess = None # Set up checkpoint saver self.save_ckpt = save_ckpt if save_ckpt: self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=20) else: # saver for loading ckpt self.saver = tf.train.Saver() self.main_summaries = tf.summary.merge_all() logdir = logger.get_dir() if logdir: self.train_writer = tf.summary.FileWriter( os.path.join(logdir, 'tb'), tf.get_default_graph()) else: self.train_writer = None
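# --- Hedged sketch (not part of the original source): setup_target_network_updates(),
# called in the constructor above, is not shown in this section. In baselines-style
# TF1 DDPG it typically builds two groups of assign ops: a hard copy used at
# initialization and a Polyak (soft) update applied after each training step. A
# minimal TF1 version, with an assumed helper name, could look like this:
import tensorflow as tf


def get_target_updates(source_vars, target_vars, tau):
    init_updates = []
    soft_updates = []
    for src, tgt in zip(source_vars, target_vars):
        # Hard copy on initialization, exponential moving average afterwards.
        init_updates.append(tf.assign(tgt, src))
        soft_updates.append(tf.assign(tgt, (1.0 - tau) * tgt + tau * src))
    return tf.group(*init_updates), tf.group(*soft_updates)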
def learn(
        save_path,
        network,
        env,
        seed=None,
        total_timesteps=None,
        nb_epochs=None,  # with default settings, perform 1M steps total
        nb_epoch_cycles=7,  # 50
        nb_rollout_steps=3,  # 100
        reward_scale=1.0,
        render=False,
        render_eval=False,
        # noise_type='adaptive-param_0.2',
        # noise_type='normal_0.2',  # large noise
        # noise_type='normal_0.02',  # small noise
        noise_type='normal_2.0',  # the action range is 360, so the noise scale must be chosen accordingly
        # noise_type='normal_5',  # large noise
        # noise_type='normal_0.2',  # small noise
        # noise_type='normal_0.00001',  # no noise
        # noise_type='ou_0.9',
        normalize_returns=False,
        normalize_observations=True,
        critic_l2_reg=1e-2,
        actor_lr=1e-4,  # large lr
        critic_lr=1e-3,  # large lr
        # actor_lr=1e-7,  # small lr
        # critic_lr=1e-3,  # small lr
        # actor_lr=1e-10,  # no lr
        # critic_lr=1e-10,  # no lr
        popart=False,
        gamma=0.99,
        clip_norm=None,
        nb_train_steps=3,  # per epoch cycle and MPI worker, 50
        nb_eval_steps=1,  # 100
        batch_size=640,  # per MPI worker
        tau=0.01,
        eval_env=None,
        param_noise_adaption_interval=3,  # 50
        **network_kwargs):

    if total_timesteps is not None:
        assert nb_epochs is None
        nb_epochs = int(total_timesteps) // (nb_epoch_cycles * nb_rollout_steps)
    else:
        nb_epochs = 500

    rank = MPI.COMM_WORLD.Get_rank()
    nb_actions = env.num_actions
    action_shape = np.array(nb_actions * [0]).shape

    # 4 pairs of positions + 3 link lengths:
    # nb_features = 2 * (env.num_actions + 1) + env.num_actions
    # 4 pairs of positions + 1 pair of target positions:
    nb_features = 2 * (env.num_actions + 2)
    observation_shape = np.array(nb_features * [0]).shape

    # assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    # memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape)
    memory = Memory(limit=int(1e6), action_shape=action_shape, observation_shape=observation_shape)
    critic = Critic(network=network, **network_kwargs)
    actor = Actor(nb_actions, network=network, **network_kwargs)

    action_noise = None
    param_noise = None
    # nb_actions = env.action_space.shape[-1]
    if noise_type is not None:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # max_action = env.action_space.high
    # logger.info('scaling actions by {} before executing in env'.format(max_action))
    # agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
    agent = DDPG(actor, critic, memory, observation_shape, action_shape,
                 gamma=gamma, tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    sess = U.get_session()

    # Prepare everything.
    agent.initialize(sess)
    # sess.graph.finalize()
    agent.reset()

    obs = env.reset()
    if eval_env is not None:
        eval_obs = eval_env.reset()
    nenvs = obs.shape[0]

    episode_reward = np.zeros(nenvs, dtype=np.float32)  # vector
    episode_step = np.zeros(nenvs, dtype=int)  # vector
    episodes = 0  # scalar
    t = 0  # scalar
    step_set = []
    reward_set = []

    epoch = 0
    start_time = time.time()

    epoch_episode_rewards = []
    mean_epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_actions = []
    epoch_qs = []
    episode_end_distance = []
    epoch_episodes = 0
    SPARSE_REWARD = False

    # Load the checkpoint so that otherwise non-initialized variables are initialized as well.
    agent.load_ini(sess, save_path)

    for epoch in range(nb_epochs):
        print('epochs: ', epoch)
        obs = env.reset()
        agent.save(save_path)
        epoch_episode_rewards = []

        for cycle in range(nb_epoch_cycles):
            # Perform rollouts.
            if nenvs > 1:
                # If simulating multiple envs in parallel, it is impossible to reset the agent
                # at the end of the episode in each of the environments, so reset here instead.
                agent.reset()

            for t_rollout in range(nb_rollout_steps):
                # Predict next action.
                action, q, _, _ = agent.step(obs, apply_noise=True, compute_Q=True)
                # print('action:', action)

                if SPARSE_REWARD:
                    new_obs, r, done, end_distance = env.step(action, SPARSE_REWARD)
                else:
                    new_obs, r, done = env.step(action, SPARSE_REWARD)
                t += 1
                episode_reward += r
                episode_step += 1
                # print('episode_re: ', episode_reward)  # [1.]

                # Book-keeping.
                epoch_actions.append(action)
                epoch_qs.append(q)
                # The batched data will be unrolled in memory.py's append.
                agent.store_transition(obs, action, r, new_obs, done)
                # print('r: ', r)  # r shape: (1,)
                obs = new_obs

            epoch_episode_rewards.append(episode_reward)
            if cycle == nb_epoch_cycles - 1:
                # Record the distance from the end position of the reacher to the goal
                # for the last step of each episode.
                if SPARSE_REWARD:
                    episode_end_distance.append(end_distance)
                else:
                    end_distance = 100.0 / r - 1
                    episode_end_distance.append(end_distance[0])
            episode_reward = np.zeros(nenvs, dtype=np.float32)  # vector

            # Train.
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []

            # Fill the memory with the noised, freshly initialized policy and pre-update the
            # critic networks during the first `preheating_step` epochs.
            preheating_step = 30  # 50 episodes = 600 steps, 12 steps per episode
            if epoch > preheating_step:
                # print('memory_entries: ', memory.nb_entries)
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)
                    # print('Train!')
                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()
            else:
                # Update the two critic networks at the start.
                cl = agent.update_critic()
                epoch_critic_losses.append(cl)
                print('critic loss in initial training: ', cl)

            # Evaluate.
            eval_episode_rewards = []
            eval_qs = []
            if eval_env is not None:
                nenvs_eval = eval_obs.shape[0]
                eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32)
                for t_rollout in range(nb_eval_steps):
                    eval_action, eval_q, _, _ = agent.step(eval_obs, apply_noise=False, compute_Q=True)
                    # eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)
                    # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    eval_obs, eval_r, eval_done, eval_info = eval_env.step(eval_action)
                    if render_eval:
                        eval_env.render()
                    eval_episode_reward += eval_r
                    eval_qs.append(eval_q)
                    for d in range(len(eval_done)):
                        if eval_done[d]:
                            eval_episode_rewards.append(eval_episode_reward[d])
                            eval_episode_rewards_history.append(eval_episode_reward[d])
                            eval_episode_reward[d] = 0.0

        mpi_size = MPI.COMM_WORLD.Get_size()

        # Log stats.
        # XXX shouldn't call np.mean on variable-length lists
        duration = time.time() - start_time
        stats = agent.get_stats()
        combined_stats = stats.copy()
        combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
        combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
        combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
        combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
        combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
        combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
        combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
        combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
        combined_stats['total/duration'] = duration
        combined_stats['total/steps_per_second'] = float(t) / float(duration)
        combined_stats['total/episodes'] = episodes
        combined_stats['rollout/episodes'] = epoch_episodes
        combined_stats['rollout/actions_std'] = np.std(epoch_actions)

        mean_epoch_episode_rewards.append(np.mean(epoch_episode_rewards))
        # print(step_set, mean_epoch_episode_rewards)
        step_set.append(t)

        plt.figure(1)
        plt.plot(step_set, mean_epoch_episode_rewards)
        plt.xlabel('Steps')
        plt.ylabel('Mean Episode Reward')
        plt.savefig('ddpg_mean.png')

        plt.figure(2)
        plt.plot(step_set, episode_end_distance)
        plt.xlabel('Steps')
        plt.ylabel('Distance to Target')
        plt.savefig('ddpgini_distance.png')
        # plt.show()

        # Evaluation statistics.
        if eval_env is not None:
            combined_stats['eval/return'] = eval_episode_rewards
            combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
            combined_stats['eval/Q'] = eval_qs
            combined_stats['eval/episodes'] = len(eval_episode_rewards)

        def as_scalar(x):
            if isinstance(x, np.ndarray):
                assert x.size == 1
                return x[0]
            elif np.isscalar(x):
                return x
            else:
                raise ValueError('expected scalar, got %s' % x)

        combined_stats_sums = MPI.COMM_WORLD.allreduce(
            np.array([np.array(x).flatten()[0] for x in combined_stats.values()]))
        combined_stats = {
            k: v / mpi_size
            for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
        }

        # Total statistics.
        combined_stats['total/epochs'] = epoch + 1
        combined_stats['total/steps'] = t

        for key in sorted(combined_stats.keys()):
            logger.record_tabular(key, combined_stats[key])

        if rank == 0:
            logger.dump_tabular()
        logger.info('')

        logdir = logger.get_dir()
        if rank == 0 and logdir:
            if hasattr(env, 'get_state'):
                with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                    pickle.dump(env.get_state(), f)
            if eval_env and hasattr(eval_env, 'get_state'):
                with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                    pickle.dump(eval_env.get_state(), f)

    print('stepset: ', step_set)
    print('rewards: ', mean_epoch_episode_rewards)
    print('distances: ', episode_end_distance)

    return agent
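For illustration only, a hypothetical call to learn(). The custom reacher environment it expects (exposing num_actions and a step(action, sparse_reward) interface) is not shown in this document, so the ReacherEnv class, its module, and its arguments below are placeholders, and the 'mlp' network name is likewise an assumption.

# Hypothetical usage sketch; ReacherEnv and reacher_env are stand-ins for the
# unshown custom environment assumed by learn() above.
from reacher_env import ReacherEnv  # hypothetical module

env = ReacherEnv(num_joints=4)      # hypothetical constructor
trained_agent = learn(
    save_path='./checkpoints/ddpg',
    network='mlp',                  # assumed network name
    env=env,
    nb_epochs=100,
    nb_epoch_cycles=7,
    nb_rollout_steps=3,
    batch_size=640,
    noise_type='normal_2.0')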
class DDPGAgent:

    def __init__(self, env, gamma, tau, buffer_maxlen, critic_learning_rate, actor_learning_rate, max_action=1):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.noise = OUNoise(env.action_space)
        self.iter = 0.0
        self.noisy = False
        self.max_action = max_action
        print(self.action_dim)
        print(self.obs_dim)

        # RL hyperparameters
        self.gamma = gamma
        self.tau = tau

        # Initialize critic and actor networks
        self.critic = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.critic_target = Critic(self.obs_dim, self.action_dim).to(self.device)

        self.actor = Actor(self.obs_dim, self.action_dim, self.max_action).to(self.device)
        self.actor_target = Actor(self.obs_dim, self.action_dim, self.max_action).to(self.device)

        # Copy the online network parameters into the target networks
        for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()):
            target_param.data.copy_(param.data)

        # Set optimization algorithms
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_learning_rate)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_learning_rate)

        self.replay_buffer = ExperienceReplayLog(buffer_maxlen)

    def get_action(self, obs):
        # print('obs:', obs)
        state = torch.FloatTensor(obs).unsqueeze(0).to(self.device)
        action = self.actor.forward(state)
        action = action.squeeze(0).cpu().detach().numpy()
        if self.noisy:
            # Add OU exploration noise that decays with the step counter.
            action = self.noise.get_action(action, self.iter)
            self.iter += 1
        return action

    def update(self, batch_size):
        # Batch updates
        state_batch, action_batch, reward_batch, next_state_batch, masks = self.replay_buffer.sample(batch_size)

        state_batch = torch.FloatTensor(state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        masks = torch.FloatTensor(masks).to(self.device)

        # Q value estimates (note: `masks` is sampled but not applied to the Bellman target here)
        curr_Q = self.critic.forward(state_batch, action_batch)
        next_actions = self.actor_target.forward(next_state_batch)
        next_Q = self.critic_target.forward(next_state_batch, next_actions.detach())
        expected_Q = reward_batch + self.gamma * next_Q

        # Update critic network
        q_loss = F.mse_loss(curr_Q, expected_Q.detach())
        self.critic_optimizer.zero_grad()
        q_loss.backward()
        self.critic_optimizer.step()

        # Update actor network
        policy_loss = -self.critic.forward(state_batch, self.actor.forward(state_batch)).mean()
        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()

        # Soft-update actor and critic target networks
        for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data * (1.0 - self.tau))
        for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data * (1.0 - self.tau))
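A minimal training-loop sketch for DDPGAgent, assuming a classic Gym-style environment (step returns obs, reward, done, info) and assuming that ExperienceReplayLog exposes push(state, action, reward, next_state, done) and __len__; those two method names are illustrative assumptions, not confirmed by the code above.

# Usage sketch under the assumptions stated above.
import gym

env = gym.make('Pendulum-v1')
agent = DDPGAgent(env, gamma=0.99, tau=1e-2, buffer_maxlen=100000,
                  critic_learning_rate=1e-3, actor_learning_rate=1e-4)
agent.noisy = True  # enable OU exploration noise during training

batch_size = 128
for episode in range(200):
    state = env.reset()
    episode_reward = 0.0
    for step in range(500):
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        # push() and len() are assumed ExperienceReplayLog methods.
        agent.replay_buffer.push(state, action, reward, next_state, done)
        if len(agent.replay_buffer) > batch_size:
            agent.update(batch_size)
        state = next_state
        episode_reward += reward
        if done:
            break
    print(episode, episode_reward)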