class TD3Agent:

    MAX_EXPERIENCES = 30000
    MIN_EXPERIENCES = 300
    ENV_ID = "Pendulum-v0"
    ACTION_SPACE = 1
    MAX_ACTION = 2
    OBSERVATION_SPACE = 3
    CRITIC_UPDATE_PERIOD = 4
    POLICY_UPDATE_PERIOD = 8
    TAU = 0.02
    GAMMA = 0.99
    BATCH_SIZE = 64
    NOISE_STDDEV = 0.2

    def __init__(self):
        self.env = gym.make(self.ENV_ID)
        self.env.max_episode_steps = 3000
        self.actor = ActorNetwork(action_space=self.ACTION_SPACE,
                                  max_action=self.MAX_ACTION)
        self.target_actor = ActorNetwork(action_space=self.ACTION_SPACE,
                                         max_action=self.MAX_ACTION)
        self.critic = CriticNetwork()
        self.target_critic = CriticNetwork()
        self.buffer = ReplayBuffer(max_experiences=self.MAX_EXPERIENCES)
        self.global_steps = 0
        self.hiscore = None
        self._build_networks()

    def _build_networks(self):
        """Initialize network parameters with a dummy forward pass."""
        dummy_state = np.random.normal(0, 0.1, size=self.OBSERVATION_SPACE)
        dummy_state = (dummy_state[np.newaxis, ...]).astype(np.float32)
        dummy_action = np.random.normal(0, 0.1, size=self.ACTION_SPACE)
        dummy_action = (dummy_action[np.newaxis, ...]).astype(np.float32)

        self.actor.call(dummy_state)
        self.target_actor.call(dummy_state)
        self.target_actor.set_weights(self.actor.get_weights())

        self.critic.call(dummy_state, dummy_action, training=False)
        self.target_critic.call(dummy_state, dummy_action, training=False)
        self.target_critic.set_weights(self.critic.get_weights())

    def play(self, n_episodes):
        total_rewards = []
        recent_scores = collections.deque(maxlen=10)

        for n in range(n_episodes):
            total_reward, localsteps = self.play_episode()
            total_rewards.append(total_reward)
            recent_scores.append(total_reward)
            recent_average_score = sum(recent_scores) / len(recent_scores)

            print(f"Episode {n}: {total_reward}")
            print(f"Local steps {localsteps}")
            print(f"Experiences {len(self.buffer)}")
            print(f"Global step {self.global_steps}")
            print(f"Noise stdev {self.NOISE_STDDEV}")
            print(f"recent average score {recent_average_score}")
            print()

            if (self.hiscore is None) or (recent_average_score > self.hiscore):
                self.hiscore = recent_average_score
                print(f"HISCORE Updated: {self.hiscore}")
                self.save_model()

        return total_rewards

    def play_episode(self):
        total_reward = 0
        steps = 0
        done = False
        state = self.env.reset()

        while not done:
            action = self.actor.sample_action(state, noise=self.NOISE_STDDEV)
            next_state, reward, done, _ = self.env.step(action)
            exp = Experience(state, action, reward, next_state, done)
            self.buffer.add_experience(exp)

            state = next_state
            total_reward += reward
            steps += 1
            self.global_steps += 1

            #: Delayed policy update
            if self.global_steps % self.CRITIC_UPDATE_PERIOD == 0:
                if self.global_steps % self.POLICY_UPDATE_PERIOD == 0:
                    self.update_network(self.BATCH_SIZE, update_policy=True)
                    self.update_target_network()
                else:
                    self.update_network(self.BATCH_SIZE)

        return total_reward, steps

    def update_network(self, batch_size, update_policy=False):
        if len(self.buffer) < self.MIN_EXPERIENCES:
            return

        (states, actions, rewards,
         next_states, dones) = self.buffer.get_minibatch(batch_size)

        #: Target policy smoothing: add clipped noise to the target action
        clipped_noise = np.clip(np.random.normal(0, 0.2, self.ACTION_SPACE),
                                -0.5, 0.5)
        next_actions = self.target_actor(next_states) + clipped_noise * self.MAX_ACTION

        #: Clipped double-Q: take the smaller of the two target critic values
        q1, q2 = self.target_critic(next_states, next_actions)
        next_qvalues = [min(a, b) for a, b
                        in zip(q1.numpy().flatten(), q2.numpy().flatten())]

        #: Compute target values and update the CriticNetwork
        target_values = np.vstack([
            reward + self.GAMMA * next_qvalue if not done else reward
            for reward, done, next_qvalue
            in zip(rewards, dones, next_qvalues)]).astype(np.float32)

        #: Update critic
        with tf.GradientTape() as tape:
            q1, q2 = self.critic(states, actions)
            loss1 = tf.reduce_mean(tf.square(target_values - q1))
            loss2 = tf.reduce_mean(tf.square(target_values - q2))
            loss = loss1 + loss2

        variables = self.critic.trainable_variables
        gradients = tape.gradient(loss, variables)
        self.critic.optimizer.apply_gradients(zip(gradients, variables))

        #: Delayed update of the ActorNetwork
        if update_policy:
            with tf.GradientTape() as tape:
                q1, _ = self.critic(states, self.actor(states))
                J = -1 * tf.reduce_mean(q1)

            variables = self.actor.trainable_variables
            gradients = tape.gradient(J, variables)
            self.actor.optimizer.apply_gradients(zip(gradients, variables))

    def update_target_network(self):
        # Soft update of the target actor
        target_actor_weights = self.target_actor.get_weights()
        actor_weights = self.actor.get_weights()
        assert len(target_actor_weights) == len(actor_weights)
        self.target_actor.set_weights(
            (1 - self.TAU) * np.array(target_actor_weights)
            + self.TAU * np.array(actor_weights))

        # Soft update of the target critic
        target_critic_weights = self.target_critic.get_weights()
        critic_weights = self.critic.get_weights()
        assert len(target_critic_weights) == len(critic_weights)
        self.target_critic.set_weights(
            (1 - self.TAU) * np.array(target_critic_weights)
            + self.TAU * np.array(critic_weights))

    def save_model(self):
        self.actor.save_weights("checkpoints/actor")
        self.critic.save_weights("checkpoints/critic")

    def load_model(self):
        self.actor.load_weights("checkpoints/actor")
        self.target_actor.load_weights("checkpoints/actor")
        self.critic.load_weights("checkpoints/critic")
        self.target_critic.load_weights("checkpoints/critic")

    def test_play(self, n, monitordir, load_model=False):
        if load_model:
            self.load_model()

        if monitordir:
            env = wrappers.Monitor(gym.make(self.ENV_ID),
                                   monitordir, force=True,
                                   video_callable=(lambda ep: ep % 1 == 0))
        else:
            env = gym.make(self.ENV_ID)

        for i in range(n):
            total_reward = 0
            steps = 0
            done = False
            state = env.reset()

            while not done:
                action = self.actor.sample_action(state, noise=False)
                next_state, reward, done, _ = env.step(action)
                state = next_state
                total_reward += reward
                steps += 1

            print()
            print(f"Test Play {i}: {total_reward}")
            print(f"Steps: {steps}")
            print()

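# Usage sketch, not part of the original file: a minimal way to drive the TD3Agent
# above. It assumes the ActorNetwork / CriticNetwork / ReplayBuffer helpers the class
# imports are available and that a "checkpoints/" directory exists for save_model().
def train_td3(n_episodes=200):
    agent = TD3Agent()
    history = agent.play(n_episodes=n_episodes)              # training with delayed policy updates
    agent.test_play(n=3, monitordir=None, load_model=True)   # noise-free evaluation
    return history
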
class DdpgHer(object):

    _default_config = {
        'n_epochs': 50,
        'n_cycles': 50,
        'n_batches': 40,
        'checkpoint_freq': 5,
        'seed': 123,
        'num_workers': 1,
        'replay_strategy': 'future',
        'clip_return': 50.,
        'noise_eps': 0.2,
        'random_eps': 0.3,
        'buffer_size': int(1e6),
        'replay_k': 4,
        'clip_obs': 200.,
        'batch_size': 256,
        'hidden_units': 256,
        'gamma': 0.98,
        'action_l2': 1.,
        'lr_actor': 0.001,
        'lr_critic': 0.001,
        'polyak': 0.95,
        'n_test_rollouts': 10,
        'clip_range': 5.,
        'demo_length': 20,
        'local_dir': None,
        'cuda': None,
        'max_gpus': None,
        'rollouts_per_worker': 2,
        'goal_space_bins': None,
        'archer_params': None,
        'q_filter': False,
        'prm_loss_weight': 0.001,
        'aux_loss_weight': 0.0078,
        'demo_batch_size': None,
        'demo_file': None,
        'num_demo': 100,
    }

    def __init__(self, env, config, reporter=None):
        super().__init__()
        self.env = env
        self.config = {**DdpgHer._default_config, **config}
        self.seed(self.config['seed'])

        a_space, obs_space = self.env.action_space, self.env.observation_space
        obs_size = obs_space.spaces['observation'].shape[0]
        goal_size = obs_space.spaces['desired_goal'].shape[0]
        self.env_params = get_env_params(self.env)
        self.reporter = reporter

        if self.config['cuda'] is None:
            self.config['cuda'] = torch.cuda.is_available()

        if self.config['cuda']:
            n_gpus = torch.cuda.device_count()
            assert n_gpus > 0
            max_gpus = self.config['max_gpus']
            if max_gpus is None:
                max_gpus = n_gpus
            n_gpus = min(n_gpus, max_gpus)
            n_workers = MPI.COMM_WORLD.size
            rank = MPI.COMM_WORLD.rank
            w_per_gpu = int(np.ceil(n_workers / n_gpus))
            gpu_i = rank // w_per_gpu
            print(f'Worker with rank {rank} assigned GPU {gpu_i}.')
            torch.cuda.set_device(gpu_i)

        self.bc_loss = self.config.get('demo_file') is not None
        self.q_filter = self.config['q_filter']

        # create the networks
        self.actor_network = ActorNetwork(
            action_space=a_space, observation_space=obs_space,
            hidden_units=self.config['hidden_units'])
        self.critic_network = CriticNetwork(
            action_space=a_space, observation_space=obs_space,
            hidden_units=self.config['hidden_units'])

        # sync the networks across the cpus
        sync_networks(self.actor_network)
        sync_networks(self.critic_network)

        # build up the target networks
        self.actor_target_network = ActorNetwork(
            action_space=a_space, observation_space=obs_space,
            hidden_units=self.config['hidden_units'])
        self.critic_target_network = CriticNetwork(
            action_space=a_space, observation_space=obs_space,
            hidden_units=self.config['hidden_units'])

        # load the weights into the target networks
        self.actor_target_network.load_state_dict(self.actor_network.state_dict())
        self.critic_target_network.load_state_dict(self.critic_network.state_dict())

        # if using the gpu
        if self.config['cuda']:
            self.actor_network.cuda()
            self.critic_network.cuda()
            self.actor_target_network.cuda()
            self.critic_target_network.cuda()

        # create the optimizers
        self.actor_optim = torch.optim.Adam(self.actor_network.parameters(),
                                            lr=self.config['lr_actor'])
        self.critic_optim = torch.optim.Adam(self.critic_network.parameters(),
                                             lr=self.config['lr_critic'])

        # goal_space_bins should be of the form:
        # [dict(axis=0, box=np.linspace(0.0, 2.0, 15)), dict(axis=1, box=np.linspace(0.0, 2.0, 15)), ...]
        weight_her_sampling = False
        self._num_reached_goals_in_bin = None
        self._num_visited_goals_in_bin = None
        self._num_observed_goals_in_bin = None
        self._goal_space_bins = self.config['goal_space_bins']
        if self._goal_space_bins is not None:
            weight_her_sampling = True
            self._num_reached_goals_in_bin = np.zeros(
                tuple(1 + b['box'].size for b in self._goal_space_bins))
            self._num_visited_goals_in_bin = self._num_reached_goals_in_bin.copy()
            self._num_observed_goals_in_bin = self._num_reached_goals_in_bin.copy()

        # her sampler
        self.her_module = HerSampler(
            self.config['replay_strategy'], self.config['replay_k'],
            self.env.compute_reward, weight_sampling=weight_her_sampling,
            archer_params=self.config['archer_params'])

        # create the normalizers
        self.o_norm = Normalizer(size=obs_size,
                                 default_clip_range=self.config['clip_range'])
        self.g_norm = Normalizer(size=goal_size,
                                 default_clip_range=self.config['clip_range'])

        # create the replay and demo buffers
        self.buffer = ReplayBuffer(self.env_params, self.config['buffer_size'],
                                   self.her_module.sample_her_transitions)
        self.demo_buffer = None
        if self.bc_loss:
            self._init_demo_buffer(update_stats=True)

        self._trained = False

    def _bin_idx_for_goals(self, goals: np.ndarray):
        assert self._goal_space_bins is not None
        return tuple(
            np.digitize(goals[..., b['axis']], b['box'], right=False)
            for b in self._goal_space_bins)

    def _get_info_for_goals(self, goals: np.ndarray):
        assert self._goal_space_bins is not None
        idx = self._bin_idx_for_goals(goals)
        times_success = self._num_reached_goals_in_bin[idx]
        times_visited = self._num_visited_goals_in_bin[idx]
        times_observed = self._num_observed_goals_in_bin[idx]
        tot_success = self._num_reached_goals_in_bin.sum()
        tot_visited = self._num_visited_goals_in_bin.sum()
        tot_observed = self._num_observed_goals_in_bin.sum()
        return (times_success, tot_success, times_visited, tot_visited,
                times_observed, tot_observed)

    def seed(self, value):
        import random
        np.random.seed(value)
        random.seed(value)
        torch.manual_seed(value)
        self.env.seed(value)

    def _training_step(self):
        rollout_times = []
        update_times = []
        update_results = []
        taken_steps = 0
        failed_steps = 0
        sampling_tot_time = 0.0
        sampling_calls = 0

        step_tic = datetime.now()
        for _ in range(self.config['n_cycles']):
            mb_obs, mb_ag, mb_g, mb_actions = [], [], [], []
            while len(mb_obs) < self.config["rollouts_per_worker"]:
                tic = datetime.now()
                step_failure = False
                # reset the rollouts
                ep_obs, ep_ag, ep_g, ep_actions = [], [], [], []
                # reset the environment
                observation = self.env.reset()
                obs = observation['observation']
                ag = observation['achieved_goal']
                g = observation['desired_goal']
                if self._goal_space_bins is not None:
                    goal_idx = self._bin_idx_for_goals(g)
                    self._num_observed_goals_in_bin[goal_idx] += 1

                # start to collect samples
                for t in range(self.env_params['max_timesteps']):
                    with torch.no_grad():
                        input_tensor = self._preproc_inputs(obs, g)
                        pi = self.actor_network(input_tensor)
                        action = self._select_actions(pi)

                    try:
                        observation_new, _, _, info = self.env.step(action)
                    except MujocoException:
                        step_failure = True
                        break

                    obs_new = observation_new['observation']
                    ag_new = observation_new['achieved_goal']
                    if self._goal_space_bins is not None:
                        goal_idx = self._bin_idx_for_goals(ag_new)
                        self._num_visited_goals_in_bin[goal_idx] += 1
                        if bool(info['is_success']):
                            self._num_reached_goals_in_bin[goal_idx] += 1

                    # append rollouts
                    ep_obs.append(obs.copy())
                    ep_ag.append(ag.copy())
                    ep_g.append(g.copy())
                    ep_actions.append(action.copy())

                    # re-assign the observation
                    obs = obs_new
                    ag = ag_new

                ep_obs.append(obs.copy())
                ep_ag.append(ag.copy())

                if step_failure:
                    failed_steps += 1
                    continue

                taken_steps += self.env_params['max_timesteps']
                mb_obs.append(ep_obs)
                mb_ag.append(ep_ag)
                mb_g.append(ep_g)
                mb_actions.append(ep_actions)
                rollout_times.append((datetime.now() - tic).total_seconds())

            # convert them into arrays
            mb_obs = np.array(mb_obs)
            mb_ag = np.array(mb_ag)
            mb_g = np.array(mb_g)
            mb_actions = np.array(mb_actions)

            # store the episodes
            self.buffer.store_episode([mb_obs, mb_ag, mb_g, mb_actions])
            self._update_normalizer([mb_obs, mb_ag, mb_g, mb_actions])

            tic = datetime.now()
            # train the network
            for _ in range(self.config['n_batches']):
                # sample the episodes
                sampling_tic = datetime.now()
                sampled_transitions = self._sample_batch()
                sampling_tot_time += (datetime.now() - sampling_tic).total_seconds()
                sampling_calls += 1
                res = self._update_network(sampled_transitions)
                update_results.append(res)

            # soft update
            self._soft_update_target_network(self.actor_target_network,
                                             self.actor_network)
            self._soft_update_target_network(self.critic_target_network,
                                             self.critic_network)
            update_times.append((datetime.now() - tic).total_seconds())

        step_time = (datetime.now() - step_tic).total_seconds()

        tic = datetime.now()
        success_rate, avg_ep_reward = self._eval_agent()
        eval_time = (datetime.now() - tic).total_seconds()

        update_results_dict = dict()
        for k in update_results[0].keys():
            update_results_dict['avg_' + k] = np.mean([r[k] for r in update_results])

        return {
            "test_success_rate": success_rate,
            "test_mean_ep_reward": avg_ep_reward,
            "avg_her_sampling_time": sampling_tot_time / sampling_calls,
            "avg_rollout_time": np.mean(rollout_times),
            "avg_network_update_time": np.mean(update_times),
            "evaluation_time": eval_time,
            "step_time": step_time,
            "env_steps": taken_steps,
            "failed_steps": failed_steps,
            **update_results_dict,
        }

    def _init_demo_buffer(self, update_stats=True):
        assert self.bc_loss
        file_path = self.config['demo_file']
        num_demo = self.config['num_demo']
        self.demo_buffer = ReplayBuffer(self.env_params,
                                        self.config['buffer_size'],
                                        self.her_module.sample_her_transitions)

        # data must be a dictionary of (at least) 4 lists;
        # each list contains partial information for each episode.
        data = pickle.load(open(file_path, 'rb'))
        assert isinstance(data, dict)

        ordered_data = []
        for k in ['mb_obs', 'mb_ag', 'mb_g', 'mb_actions']:
            mb_data = np.asarray(data[k])
            assert len(mb_data) >= num_demo
            ordered_data.append(mb_data[:num_demo])

        self.demo_buffer.store_episode(ordered_data)
        if update_stats:
            self._update_normalizer(ordered_data)

    def _sample_batch(self):
        batch_size = self.config['batch_size']
        sample_kwargs = dict()
        if self._goal_space_bins is not None:
            sample_kwargs['get_info_for_goals'] = self._get_info_for_goals

        if self.bc_loss:
            demo_batch_size = self.config['demo_batch_size']
            transitions = self.buffer.sample(batch_size - demo_batch_size,
                                             **sample_kwargs)
            transitions_demo = self.demo_buffer.sample(demo_batch_size)
            for k, values in transitions_demo.items():
                rollout_vec = transitions[k].tolist()
                for v in values:
                    rollout_vec.append(v.tolist())
                transitions[k] = np.array(rollout_vec)
        else:
            transitions = self.buffer.sample(batch_size, **sample_kwargs)
        return transitions

    def save_checkpoint(self, epoch=0):
        local_dir = self.config.get('local_dir')
        if local_dir is not None:
            local_dir = local_dir + '/checkpoints'
            os.makedirs(local_dir, exist_ok=True)
            model_path = f'{local_dir}/model_{epoch}.pt'
            status_path = f'{local_dir}/status_{epoch}.pkl'
            torch.save([
                self.o_norm.mean, self.o_norm.std,
                self.g_norm.mean, self.g_norm.std,
                self.actor_network.state_dict()
            ], model_path)
            with open(status_path, 'wb') as f:
                pickle.dump(dict(config=self.config), f)

    @staticmethod
    def load(env, local_dir, epoch=None):
        epoch = epoch or '*[0-9]'
        models = glob.glob(f'{local_dir}/model_{epoch}.pt')
        assert len(models) > 0, "No checkpoints found!"
        model_path = sorted(models, key=os.path.getmtime)[-1]
        epoch = model_path.split("_")[-1].split(".")[0]
        status_path = f'{local_dir}/status_{epoch}.pkl'
        with open(status_path, 'rb') as f:
            status = pickle.load(f)
        status['config']['cuda'] = torch.cuda.is_available()

        agent = DdpgHer(env, status['config'])
        agent._trained = True
        o_mean, o_std, g_mean, g_std, actor_state = torch.load(
            model_path, map_location=lambda storage, loc: storage)
        agent.o_norm.mean = o_mean
        agent.o_norm.std = o_std
        agent.g_norm.mean = g_mean
        agent.g_norm.std = g_std
        agent.actor_network.load_state_dict(actor_state)
        agent.actor_network.eval()
        print(f'Loaded model for epoch {epoch}.')
        return agent

    def predict(self, obs):
        if not self._trained:
            raise RuntimeError
        g = obs['desired_goal']
        obs = obs['observation']
        with torch.no_grad():
            inputs = self._preproc_inputs(obs, g)
            pi = self.actor_network(inputs)
            action = pi.cpu().numpy().squeeze()
        return action

    def train(self):
        if self._trained:
            raise RuntimeError

        # make sure that different workers have different seeds
        # (from baselines' original implementation)
        local_uniform = np.random.uniform(size=(1, ))
        root_uniform = local_uniform.copy()
        MPI.COMM_WORLD.Bcast(root_uniform, root=0)
        if MPI.COMM_WORLD.Get_rank() != 0:
            assert local_uniform[0] != root_uniform[0]

        tic = datetime.now()
        n_epochs = self.config.get('n_epochs')
        saved_checkpoints = 0
        total_env_steps = 0
        for iter_i in it.count():
            if n_epochs is not None and iter_i >= n_epochs:
                break
            res = self._training_step()
            total_env_steps += res['env_steps']
            if MPI.COMM_WORLD.Get_rank() == 0:
                if (iter_i + 1) % self.config['checkpoint_freq'] == 0:
                    self.save_checkpoint(epoch=(iter_i + 1))
                    saved_checkpoints += 1
                if callable(self.reporter):
                    self.reporter(**{
                        **res,
                        "training_iteration": iter_i + 1,
                        "total_time": (datetime.now() - tic).total_seconds(),
                        "checkpoints": saved_checkpoints,
                        "total_env_steps": total_env_steps,
                        "current_buffer_size": self.buffer.current_size,
                    })

    # pre-process the inputs
    def _preproc_inputs(self, obs, g):
        obs_norm = self.o_norm.normalize(obs)
        g_norm = self.g_norm.normalize(g)
        # concatenate observation and goal
        inputs = np.concatenate([obs_norm, g_norm])
        inputs = torch.tensor(inputs, dtype=torch.float32).unsqueeze(0)
        if self.config['cuda']:
            inputs = inputs.cuda()
        return inputs

    # choose an action for the agent and handle exploration
    def _select_actions(self, pi):
        action = pi.cpu().numpy().squeeze()
        # add gaussian noise
        action += self.config['noise_eps'] * self.env_params['action_max'] \
            * np.random.randn(*action.shape)
        action = np.clip(action, -self.env_params['action_max'],
                         self.env_params['action_max'])
        # random actions
        random_actions = np.random.uniform(low=-self.env_params['action_max'],
                                           high=self.env_params['action_max'],
                                           size=self.env_params['action'])
        # choose whether to use the random actions
        action += np.random.binomial(1, self.config['random_eps'],
                                     1)[0] * (random_actions - action)
        return action

    # update the normalizer
    def _update_normalizer(self, episode_batch):
        mb_obs, mb_ag, mb_g, mb_actions = episode_batch
        mb_obs_next = mb_obs[:, 1:, :]
        mb_ag_next = mb_ag[:, 1:, :]
        # get the number of normalization transitions
        num_transitions = mb_actions.shape[1]
        # create the new buffer to store them
        buffer_temp = {
            'obs': mb_obs,
            'ag': mb_ag,
            'g': mb_g,
            'actions': mb_actions,
            'obs_next': mb_obs_next,
            'ag_next': mb_ag_next,
        }
        transitions = self.her_module.sample_her_transitions(buffer_temp,
                                                             num_transitions)
        obs, g = transitions['obs'], transitions['g']
        # pre-process the obs and g
        transitions['obs'], transitions['g'] = self._preproc_og(obs, g)
        # update
        self.o_norm.update(transitions['obs'])
        self.g_norm.update(transitions['g'])
        # recompute the stats
        self.o_norm.recompute_stats()
        self.g_norm.recompute_stats()

    def _preproc_og(self, o, g):
        o = np.clip(o, -self.config['clip_obs'], self.config['clip_obs'])
        g = np.clip(g, -self.config['clip_obs'], self.config['clip_obs'])
        return o, g

    # soft update
    def _soft_update_target_network(self, target, source):
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_((1 - self.config['polyak']) * param.data
                                    + self.config['polyak'] * target_param.data)

    # update the network
    def _update_network(self, transitions):
        # pre-process the observation and goal
        o, o_next, g = transitions['obs'], transitions['obs_next'], transitions['g']
        transitions['obs'], transitions['g'] = self._preproc_og(o, g)
        transitions['obs_next'], transitions['g_next'] = self._preproc_og(o_next, g)

        # start to do the update
        obs_norm = self.o_norm.normalize(transitions['obs'])
        g_norm = self.g_norm.normalize(transitions['g'])
        inputs_norm = np.concatenate([obs_norm, g_norm], axis=1)
        obs_next_norm = self.o_norm.normalize(transitions['obs_next'])
        g_next_norm = self.g_norm.normalize(transitions['g_next'])
        inputs_next_norm = np.concatenate([obs_next_norm, g_next_norm], axis=1)

        # transfer them into tensors
        inputs_norm_tensor = torch.tensor(inputs_norm, dtype=torch.float32)
        inputs_next_norm_tensor = torch.tensor(inputs_next_norm, dtype=torch.float32)
        actions_tensor = torch.tensor(transitions['actions'], dtype=torch.float32)
        r_tensor = torch.tensor(transitions['r'], dtype=torch.float32)
        if self.config['cuda']:
            inputs_norm_tensor = inputs_norm_tensor.cuda()
            inputs_next_norm_tensor = inputs_next_norm_tensor.cuda()
            actions_tensor = actions_tensor.cuda()
            r_tensor = r_tensor.cuda()

        # calculate the target Q value function
        with torch.no_grad():
            actions_next = self.actor_target_network(inputs_next_norm_tensor)
            q_next_value = self.critic_target_network(inputs_next_norm_tensor,
                                                      actions_next)
            q_next_value = q_next_value.detach()
            target_q_value = r_tensor + self.config['gamma'] * q_next_value
            target_q_value = target_q_value.detach()
            # clip the q value
            clip_return = 1 / (1 - self.config['gamma'])
            target_q_value = torch.clamp(target_q_value, -clip_return, 0)

        # the q loss
        real_q_value = self.critic_network(inputs_norm_tensor, actions_tensor)
        critic_loss = (target_q_value - real_q_value).pow(2).mean()

        # self.main.Q_tf ==> real_q_value
        # self.main.Q_pi_tf ==> self.critic_network(inputs_norm_tensor, actions_real) ==> approx_q_value

        # the actor loss
        action_l2 = self.config['action_l2']
        actions_real = self.actor_network(inputs_norm_tensor)
        approx_q_value = self.critic_network(inputs_norm_tensor, actions_real)

        if self.bc_loss:
            # train with demonstrations using behavior cloning;
            # mask out everything except the demo-buffer samples
            b_size = self.config['batch_size']
            demo_b_size = self.config['demo_batch_size']
            mask = np.concatenate(
                (np.zeros(b_size - demo_b_size), np.ones(demo_b_size)), axis=0)
            mask = torch.tensor(mask, dtype=torch.uint8,
                                device=actions_real.device)

            if self.q_filter:
                # use the Q-filter trick to perform BC only when needed
                with torch.no_grad():
                    mask &= (real_q_value > approx_q_value).squeeze()

            prm_loss_weight = self.config['prm_loss_weight']
            cloning_loss = self.config['aux_loss_weight'] * (
                actions_real[mask] - actions_tensor[mask]).pow(2).sum()
        else:
            # train without demonstrations
            prm_loss_weight = 1.0
            cloning_loss = None

        actor_loss = -prm_loss_weight * approx_q_value.mean()
        actor_loss += prm_loss_weight * action_l2 * (
            actions_real / self.env_params['action_max']).pow(2).mean()
        if cloning_loss is not None:
            actor_loss += cloning_loss

        # update the actor network
        self.actor_optim.zero_grad()
        actor_loss.backward()
        sync_grads(self.actor_network)
        self.actor_optim.step()

        # update the critic network
        self.critic_optim.zero_grad()
        critic_loss.backward()
        sync_grads(self.critic_network)
        self.critic_optim.step()

        res = dict(actor_loss=actor_loss.item(), critic_loss=critic_loss.item())
        if cloning_loss is not None:
            res['cloning_loss'] = cloning_loss.item()
        return res

    # do the evaluation
    def _eval_agent(self):
        total_success_rate = []
        ep_rewards = []
        for _ in range(self.config['n_test_rollouts']):
            per_success_rate = []
            ep_reward = 0.0
            observation = self.env.reset()
            obs = observation['observation']
            g = observation['desired_goal']
            for _ in range(self.env_params['max_timesteps']):
                with torch.no_grad():
                    input_tensor = self._preproc_inputs(obs, g)
                    pi = self.actor_network(input_tensor)
                    # convert the actions
                    actions = pi.detach().cpu().numpy().squeeze()
                observation_new, rew, _, info = self.env.step(actions)
                obs = observation_new['observation']
                g = observation_new['desired_goal']
                per_success_rate.append(info['is_success'])
                ep_reward += rew
            ep_rewards.append(ep_reward)
            total_success_rate.append(per_success_rate)

        total_success_rate = np.array(total_success_rate)
        local_success_rate = np.mean(total_success_rate[:, -1])
        global_success_rate = MPI.COMM_WORLD.allreduce(local_success_rate,
                                                       op=MPI.SUM)
        global_success_rate /= MPI.COMM_WORLD.Get_size()
        avg_ep_reward = np.array(ep_rewards).mean()
        global_avg_ep_reward = MPI.COMM_WORLD.allreduce(avg_ep_reward, op=MPI.SUM)
        global_avg_ep_reward /= MPI.COMM_WORLD.Get_size()
        return global_success_rate, global_avg_ep_reward

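# Usage sketch, an assumption rather than part of the original source: training DdpgHer
# on a goal-conditioned Gym task and reloading a checkpoint for inference.
# "FetchReach-v1", the config overrides, and an initialised MPI runtime (e.g. mpirun)
# are all assumptions here.
def train_ddpg_her():
    import gym
    env = gym.make("FetchReach-v1")
    agent = DdpgHer(env, config={'n_epochs': 10, 'local_dir': './runs'})
    agent.train()                                        # MPI-aware training loop above
    trained = DdpgHer.load(env, './runs/checkpoints')    # picks the newest checkpoint
    return trained.predict(env.reset())                  # greedy action from the actor
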
class PPOAgent:

    GAMMA = 0.99
    GAE_LAMBDA = 0.95
    CLIPRANGE = 0.2
    OPT_ITER = 20
    BATCH_SIZE = 2048

    def __init__(self, env_id, action_space, trajectory_size=256,
                 n_envs=1, max_timesteps=1500):
        self.env_id = env_id
        self.n_envs = n_envs
        self.trajectory_size = trajectory_size
        self.vecenv = VecEnv(env_id=self.env_id, n_envs=self.n_envs,
                             max_timesteps=max_timesteps)
        self.policy = PolicyNetwork(action_space=action_space)
        self.old_policy = PolicyNetwork(action_space=action_space)
        self.critic = CriticNetwork()
        self.r_running_stats = util.RunningStats(shape=(action_space, ))
        self._init_network()

    def _init_network(self):
        env = gym.make(self.env_id)
        state = np.atleast_2d(env.reset())
        self.policy(state)
        self.old_policy(state)

    def run(self, n_updates, logdir):
        self.summary_writer = tf.summary.create_file_writer(str(logdir))
        history = {"steps": [], "scores": []}
        states = self.vecenv.reset()
        hiscore = None

        for epoch in range(n_updates):
            for _ in range(self.trajectory_size):
                actions = self.policy.sample_action(states)
                next_states = self.vecenv.step(actions)
                states = next_states

            trajectories = self.vecenv.get_trajectories()
            for trajectory in trajectories:
                self.r_running_stats.update(trajectory["r"])

            trajectories = self.compute_advantage(trajectories)
            states, actions, advantages, vtargs = self.create_minibatch(trajectories)
            vloss = self.update_critic(states, vtargs)
            self.update_policy(states, actions, advantages)

            global_steps = (epoch + 1) * self.trajectory_size * self.n_envs
            train_scores = np.array([traj["r"].sum() for traj in trajectories])

            if epoch % 1 == 0:
                test_scores, total_steps = self.play(n=1)
                test_scores, total_steps = np.array(test_scores), np.array(total_steps)
                history["steps"].append(global_steps)
                history["scores"].append(test_scores.mean())
                ma_score = sum(history["scores"][-10:]) / 10
                with self.summary_writer.as_default():
                    tf.summary.scalar("test_score", test_scores.mean(), step=epoch)
                    tf.summary.scalar("test_steps", total_steps.mean(), step=epoch)
                print(f"Epoch {epoch}, {global_steps//1000}K, {test_scores.mean()}")

                if epoch // 10 > 10 and (hiscore is None or ma_score > hiscore):
                    self.save_model()
                    hiscore = ma_score
                    print("Model Saved")

            with self.summary_writer.as_default():
                tf.summary.scalar("value_loss", vloss, step=epoch)
                tf.summary.scalar("train_score", train_scores.mean(), step=epoch)

        return history

    def compute_advantage(self, trajectories):
        """Generalized Advantage Estimation (GAE, 2016)"""
        for trajectory in trajectories:
            trajectory["v_pred"] = self.critic(trajectory["s"]).numpy()
            trajectory["v_pred_next"] = self.critic(trajectory["s2"]).numpy()
            is_nonterminals = 1 - trajectory["done"]
            normed_rewards = (trajectory["r"]
                              / (np.sqrt(self.r_running_stats.var) + 1e-4))
            deltas = (normed_rewards
                      + self.GAMMA * is_nonterminals * trajectory["v_pred_next"]
                      - trajectory["v_pred"])

            advantages = np.zeros_like(deltas, dtype=np.float32)
            lastgae = 0
            for i in reversed(range(len(deltas))):
                lastgae = (deltas[i]
                           + self.GAMMA * self.GAE_LAMBDA * is_nonterminals[i] * lastgae)
                advantages[i] = lastgae

            trajectory["advantage"] = advantages
            trajectory["R"] = advantages + trajectory["v_pred"]

        return trajectories

    def update_policy(self, states, actions, advantages):
        self.old_policy.set_weights(self.policy.get_weights())
        indices = np.random.choice(range(states.shape[0]),
                                   (self.OPT_ITER, self.BATCH_SIZE))
        for i in range(self.OPT_ITER):
            idx = indices[i]
            old_means, old_stdevs = self.old_policy(states[idx])
            old_logprob = self.compute_logprob(old_means, old_stdevs, actions[idx])

            with tf.GradientTape() as tape:
                new_means, new_stdevs = self.policy(states[idx])
                new_logprob = self.compute_logprob(new_means, new_stdevs, actions[idx])
                ratio = tf.exp(new_logprob - old_logprob)
                ratio_clipped = tf.clip_by_value(ratio,
                                                 1 - self.CLIPRANGE,
                                                 1 + self.CLIPRANGE)
                loss_unclipped = ratio * advantages[idx]
                loss_clipped = ratio_clipped * advantages[idx]
                loss = tf.minimum(loss_unclipped, loss_clipped)
                loss = -1 * tf.reduce_mean(loss)

            grads = tape.gradient(loss, self.policy.trainable_variables)
            grads, _ = tf.clip_by_global_norm(grads, 0.5)
            self.policy.optimizer.apply_gradients(
                zip(grads, self.policy.trainable_variables))

    def update_critic(self, states, v_targs):
        losses = []
        indices = np.random.choice(range(states.shape[0]),
                                   (self.OPT_ITER, self.BATCH_SIZE))
        for i in range(self.OPT_ITER):
            idx = indices[i]
            old_vpred = self.critic(states[idx])

            with tf.GradientTape() as tape:
                vpred = self.critic(states[idx])
                vpred_clipped = old_vpred + tf.clip_by_value(
                    vpred - old_vpred, -self.CLIPRANGE, self.CLIPRANGE)
                loss = tf.maximum(tf.square(v_targs[idx] - vpred),
                                  tf.square(v_targs[idx] - vpred_clipped))
                loss = tf.reduce_mean(loss)

            grads = tape.gradient(loss, self.critic.trainable_variables)
            grads, _ = tf.clip_by_global_norm(grads, 0.5)
            self.critic.optimizer.apply_gradients(
                zip(grads, self.critic.trainable_variables))
            losses.append(loss)

        return np.array(losses).mean()

    @tf.function
    def compute_logprob(self, means, stdevs, actions):
        """Compute log p(x) from the Gaussian probability density function:

        logp(x) = -0.5 log(2π) - log(std) - 0.5 * ((x - mean) / std)^2
        """
        logprob = -0.5 * np.log(2 * np.pi)
        logprob += -tf.math.log(stdevs)
        logprob += -0.5 * tf.square((actions - means) / stdevs)
        logprob = tf.reduce_sum(logprob, axis=1, keepdims=True)
        return logprob

    def create_minibatch(self, trajectories):
        states = np.vstack([traj["s"] for traj in trajectories])
        actions = np.vstack([traj["a"] for traj in trajectories])
        advantages = np.vstack([traj["advantage"] for traj in trajectories])
        v_targs = np.vstack([traj["R"] for traj in trajectories])
        return states, actions, advantages, v_targs

    def save_model(self):
        self.policy.save_weights("checkpoints/policy")
        self.critic.save_weights("checkpoints/critic")

    def load_model(self):
        self.policy.load_weights("checkpoints/policy")
        self.critic.load_weights("checkpoints/critic")

    def play(self, n=1, monitordir=None, verbose=False):
        if monitordir:
            env = wrappers.Monitor(gym.make(self.env_id),
                                   monitordir, force=True,
                                   video_callable=(lambda ep: True))
        else:
            env = gym.make(self.env_id)

        total_rewards = []
        total_steps = []

        for _ in range(n):
            state = env.reset()
            done = False
            total_reward = 0
            steps = 0
            while not done:
                steps += 1
                action = self.policy.sample_action(state)
                next_state, reward, done, _ = env.step(action[0])
                if verbose:
                    mean, sd = self.policy(np.atleast_2d(state))
                    print(mean, sd)
                    print(reward)
                total_reward += reward
                if done:
                    break
                else:
                    state = next_state
            total_rewards.append(total_reward)
            total_steps.append(steps)
            print()
            print(total_reward, steps)
            print()

        return total_rewards, total_steps

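# Usage sketch (assumed, not from the original source): running the PPOAgent above.
# "Pendulum-v0", n_envs and the update count are illustrative; action_space must match
# the dimensionality expected by PolicyNetwork and VecEnv.
def train_ppo():
    agent = PPOAgent(env_id="Pendulum-v0", action_space=1,
                     trajectory_size=256, n_envs=4)
    history = agent.run(n_updates=500, logdir="./logs/ppo")
    agent.play(n=3)
    return history
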
class DDPGAgent:

    MAX_EXPERIENCES = 30000
    MIN_EXPERIENCES = 300
    ENV_ID = "Pendulum-v0"
    ACTION_SPACE = 1
    OBSERVATION_SPACE = 3
    UPDATE_PERIOD = 4
    START_EPISODES = 20
    TAU = 0.02
    GAMMA = 0.99
    BATCH_SIZE = 32

    def __init__(self):
        self.env = gym.make(self.ENV_ID)
        self.env.max_episode_steps = 1000
        self.actor_network = ActorNetwork(action_space=self.ACTION_SPACE)
        self.target_actor_network = ActorNetwork(action_space=self.ACTION_SPACE)
        self.critic_network = CriticNetwork()
        self.target_critic_network = CriticNetwork()
        self.stdev = 0.2
        self.buffer = ReplayBuffer(max_experiences=self.MAX_EXPERIENCES)
        self.global_steps = 0
        self.hiscore = None
        self._build_networks()

    def _build_networks(self):
        """Initialize network parameters with a dummy forward pass."""
        dummy_state = np.random.normal(0, 0.1, size=self.OBSERVATION_SPACE)
        dummy_state = (dummy_state[np.newaxis, ...]).astype(np.float32)
        dummy_action = np.random.normal(0, 0.1, size=self.ACTION_SPACE)
        dummy_action = (dummy_action[np.newaxis, ...]).astype(np.float32)

        self.actor_network.call(dummy_state)
        self.target_actor_network.call(dummy_state)
        self.target_actor_network.set_weights(self.actor_network.get_weights())

        self.critic_network.call(dummy_state, dummy_action, training=False)
        self.target_critic_network.call(dummy_state, dummy_action, training=False)
        self.target_critic_network.set_weights(self.critic_network.get_weights())

    def play(self, n_episodes):
        total_rewards = []
        recent_scores = collections.deque(maxlen=10)

        for n in range(n_episodes):
            if n <= self.START_EPISODES:
                total_reward, localsteps = self.play_episode(random=True)
            else:
                total_reward, localsteps = self.play_episode()

            total_rewards.append(total_reward)
            recent_scores.append(total_reward)
            recent_average_score = sum(recent_scores) / len(recent_scores)

            print(f"Episode {n}: {total_reward}")
            print(f"Local steps {localsteps}")
            print(f"Experiences {len(self.buffer)}")
            print(f"Global step {self.global_steps}")
            print(f"Noise stdev {self.stdev}")
            print(f"recent average score {recent_average_score}")
            print()

            if (self.hiscore is None) or (recent_average_score > self.hiscore):
                self.hiscore = recent_average_score
                print(f"HISCORE Updated: {self.hiscore}")
                self.save_model()

        return total_rewards

    def play_episode(self, random=False):
        total_reward = 0
        steps = 0
        done = False
        state = self.env.reset()

        while not done:
            if random:
                action = np.random.uniform(-2, 2, size=self.ACTION_SPACE)
            else:
                action = self.actor_network.sample_action(state, noise=self.stdev)

            next_state, reward, done, _ = self.env.step(action)
            exp = Experience(state, action, reward, next_state, done)
            self.buffer.add_experience(exp)

            state = next_state
            total_reward += reward
            steps += 1
            self.global_steps += 1

            if self.global_steps % self.UPDATE_PERIOD == 0:
                self.update_network(self.BATCH_SIZE)
                self.update_target_network()

        return total_reward, steps

    def update_network(self, batch_size):
        if len(self.buffer) < self.MIN_EXPERIENCES:
            return

        (states, actions, rewards,
         next_states, dones) = self.buffer.get_minibatch(batch_size)

        next_actions = self.target_actor_network(next_states)
        next_qvalues = self.target_critic_network(next_states,
                                                  next_actions).numpy().flatten()

        #: Compute target values and update the CriticNetwork
        target_values = np.vstack([
            reward + self.GAMMA * next_qvalue if not done else reward
            for reward, done, next_qvalue
            in zip(rewards, dones, next_qvalues)]).astype(np.float32)

        with tf.GradientTape() as tape:
            qvalues = self.critic_network(states, actions)
            loss = tf.reduce_mean(tf.square(target_values - qvalues))

        variables = self.critic_network.trainable_variables
        gradients = tape.gradient(loss, variables)
        self.critic_network.optimizer.apply_gradients(zip(gradients, variables))

        #: Update the ActorNetwork
        with tf.GradientTape() as tape:
            J = -1 * tf.reduce_mean(
                self.critic_network(states, self.actor_network(states)))

        variables = self.actor_network.trainable_variables
        gradients = tape.gradient(J, variables)
        self.actor_network.optimizer.apply_gradients(zip(gradients, variables))

    def update_target_network(self):
        # Soft update of the target actor
        target_actor_weights = self.target_actor_network.get_weights()
        actor_weights = self.actor_network.get_weights()
        assert len(target_actor_weights) == len(actor_weights)
        self.target_actor_network.set_weights(
            (1 - self.TAU) * np.array(target_actor_weights)
            + self.TAU * np.array(actor_weights))

        # Soft update of the target critic
        target_critic_weights = self.target_critic_network.get_weights()
        critic_weights = self.critic_network.get_weights()
        assert len(target_critic_weights) == len(critic_weights)
        self.target_critic_network.set_weights(
            (1 - self.TAU) * np.array(target_critic_weights)
            + self.TAU * np.array(critic_weights))

    def save_model(self):
        self.actor_network.save_weights("checkpoints/actor")
        self.critic_network.save_weights("checkpoints/critic")

    def load_model(self):
        self.actor_network.load_weights("checkpoints/actor")
        self.target_actor_network.load_weights("checkpoints/actor")
        self.critic_network.load_weights("checkpoints/critic")
        self.target_critic_network.load_weights("checkpoints/critic")

    def test_play(self, n, monitordir, load_model=False):
        if load_model:
            self.load_model()

        if monitordir:
            env = wrappers.Monitor(gym.make(self.ENV_ID),
                                   monitordir, force=True,
                                   video_callable=(lambda ep: ep % 1 == 0))
        else:
            env = gym.make(self.ENV_ID)

        for i in range(n):
            total_reward = 0
            steps = 0
            done = False
            state = env.reset()

            while not done:
                action = self.actor_network.sample_action(state, noise=False)
                next_state, reward, done, _ = env.step(action)
                state = next_state
                total_reward += reward
                steps += 1

            print()
            print(f"Test Play {i}: {total_reward}")
            print(f"Steps: {steps}")
            print()

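# Usage sketch (assumed): driving the TF2 DDPGAgent above. The first START_EPISODES
# episodes act uniformly at random to seed the replay buffer before the learned policy
# (plus Gaussian exploration noise) takes over.
def train_ddpg(n_episodes=150):
    agent = DDPGAgent()
    rewards = agent.play(n_episodes=n_episodes)
    agent.test_play(n=3, monitordir=None, load_model=True)
    return rewards
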
class DDPGAgent:
    """A DDPG Agent"""

    def __init__(self, state_dim, action_dim, lr_actor=1e-4, lr_critic=1e-4,
                 lr_decay=.95, replay_buff_size=10000, gamma=.99, batch_size=128,
                 random_seed=42, soft_update_tau=1e-3,
                 actor_layer_dim_1=128, actor_layer_dim_2=128, actor_layer_dim_3=0,
                 critic_layer_dim_1=128, critic_layer_dim_2=64, critic_layer_dim_3=0):
        """Initialize the model."""
        self.lr_actor = lr_actor
        self.gamma = gamma
        self.lr_critic = lr_critic
        self.lr_decay = lr_decay
        self.tau = soft_update_tau

        self.actor_local = ActorNetwork(
            state_dim, action_dim, actor_layer_dim_1, actor_layer_dim_2,
            actor_layer_dim_3).to(device=device)
        self.actor_target = ActorNetwork(
            state_dim, action_dim, actor_layer_dim_1, actor_layer_dim_2,
            actor_layer_dim_3).to(device=device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.lr_actor)

        self.critic_local = CriticNetwork(
            state_dim, action_dim, critic_layer_dim_1, critic_layer_dim_2,
            critic_layer_dim_3).to(device=device)
        self.critic_target = CriticNetwork(
            state_dim, action_dim, critic_layer_dim_1, critic_layer_dim_2,
            critic_layer_dim_3).to(device=device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.lr_critic)

        self.noise = OUNoise(action_dim, random_seed)
        self.memory = ReplayBuffer(action_dim, replay_buff_size, batch_size,
                                   random_seed)
        self.path = ""

    def update_model(self, state, action, reward, next_state, done):
        """Update policy and value parameters from a sampled batch of experiences.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        :state, action, reward, next_state, done: latest transition; it is added
            to the replay buffer before a training batch is sampled
        """
        self.memory.add(state, action, reward, next_state, done)
        if not self.memory.is_ready():
            return

        experiences = self.memory.sample()
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones)).detach()
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.smooth_l1_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ----------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ------------------------ #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def act(self, state, noise_t=0.0):
        """Return actions for the given state as per the current policy."""
        if len(np.shape(state)) == 1:
            state = state.reshape(1, -1)
        state = torch.from_numpy(state).float().to(device=device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        action += self.noise.sample() * noise_t
        return np.clip(action, -1, 1).squeeze()

    def reset(self):
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        :local_model: PyTorch model (weights will be copied from)
        :target_model: PyTorch model (weights will be copied to)
        :tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data
                                    + (1.0 - tau) * target_param.data)

class Agent():
    """DDPG Agent"""

    def __init__(self, state_size, action_size, hd1_units=400, hd2_units=300,
                 random_seed=0, buffer_size=int(2e5), batch_size=256, tau=0.0005,
                 actorLr=1e-3, criticLr=1e-3, weight_decay=0, update_every=20,
                 gamma=0.99):
        """
        :state_size (int): dimension of each state
        :action_size (int): dimension of each action
        :hd1_units (int): number of units in the first hidden layer
        :hd2_units (int): number of units in the second hidden layer
        :random_seed (int): random seed
        :buffer_size (int): replay buffer size
        :batch_size (int): batch size
        :tau (float): interpolation factor
        :actorLr (float): actor learning rate
        :criticLr (float): critic learning rate
        :weight_decay (float): optimizer L2 penalty
        :update_every (int): learning frequency
        :gamma (float): discount factor
        """
        self.state_size = state_size
        self.action_size = action_size
        self.update_every = update_every
        self.gamma = gamma
        self.tau = tau
        random.seed(random_seed)

        # Actor & target networks
        self.actor_local = ActorNetwork(state_size, action_size, random_seed,
                                        hd1_units, hd2_units).to(device)
        self.actor_target = ActorNetwork(state_size, action_size, random_seed,
                                         hd1_units, hd2_units).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=actorLr, weight_decay=weight_decay)

        # Critic & target networks
        self.critic_local = CriticNetwork(state_size, action_size, random_seed,
                                          400, 300).to(device)
        self.critic_target = CriticNetwork(state_size, action_size, random_seed,
                                           400, 300).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=criticLr, weight_decay=weight_decay)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, random_seed)

        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # store transition
        self.memory.add(state, action, reward, next_state, done)

        # Learn every update_every time steps
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # Learn, if enough samples are available in memory
            if len(self.memory) > 10000:
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def act(self, state, eps, add_noise=True):
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        # manual action clipping
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """
        :experiences (Tuple): transition parameters (s, a, r, s', done)
        :gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Update critic
        # Get the predicted next-state actions and Q values from the target nets
        with torch.no_grad():
            actions_next = self.actor_target(next_states)
            Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 5)
        self.critic_optimizer.step()

        # Update actor
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 5)
        self.actor_optimizer.step()

        # Update target networks
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """
        :local_model: source network
        :target_model: destination network
        :tau (float): interpolation factor
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data
                                    + (1.0 - tau) * target_param.data)

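# Usage sketch (assumed): one training episode with the Agent above. Learning only
# starts once the replay buffer holds more than 10000 transitions, as checked in step().
# The env and its 4-tuple step() return are assumptions of this sketch.
def run_episode(env, agent, eps=0.0):
    state, done, score = env.reset(), False, 0.0
    agent.reset()
    while not done:
        action = agent.act(state, eps, add_noise=True)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)   # store + periodic learn
        state, score = next_state, score + reward
    return score
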
class DDPGAgent():

    def __init__(self, state_space, action_space, buffer_size, batch_size,
                 learning_rate_actor, learning_rate_critic, update_rate, gamma,
                 tau, device, seed, num_agents, epsilon, epsilon_decay, epsilon_min):
        self.num_agents = num_agents
        self.action_space = action_space
        self.state_space = state_space
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.step_count = 0
        self.update_rate = update_rate
        self.tau = tau
        self.seed = seed
        self.device = device
        self.gamma = gamma

        self.actor_local_network = ActorNetwork(state_space, action_space,
                                                device, seed).to(device)
        self.actor_target_network = ActorNetwork(state_space, action_space,
                                                 device, seed).to(device)
        self.critic_local_network = CriticNetwork(state_space, action_space,
                                                  device, seed).to(device)
        self.critic_target_network = CriticNetwork(state_space, action_space,
                                                   device, seed).to(device)

        self.actor_optimizer = torch.optim.Adam(
            self.actor_local_network.parameters(), lr=learning_rate_actor)
        self.critic_optimizer = torch.optim.Adam(
            self.critic_local_network.parameters(), lr=learning_rate_critic)

        self.noise = OUNoise(action_space, seed)
        self.memory = ReplayBuffer(buffer_size=self.buffer_size,
                                   batch_size=self.batch_size,
                                   device=device, seed=seed)

        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min

    def reset(self):
        self.noise.reset()

    def act(self, state, epsilon, add_noise=True):
        # if random.random() > epsilon:
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local_network.eval()
        with torch.no_grad():
            action = self.actor_local_network(state).cpu().data.numpy()
        self.actor_local_network.train()
        if add_noise:
            action += self.noise.sample() * self.epsilon
        # else:
        #     action = np.random.randn(self.num_agents, self.action_space)
        return np.clip(action, -1, 1)

    def step(self, states, actions, rewards, next_states, dones):
        # store one transition per agent
        for state, action, reward, next_state, done in zip(states, actions, rewards,
                                                           next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        self.step_count = (self.step_count + 1) % self.update_rate
        if self.step_count == 0 and len(self.memory) > self.batch_size:
            self.learn(self.gamma)

    def learn(self, gamma):
        # interaction between the actor & critic networks
        states, actions, rewards, next_states, dones = self.memory.sample()

        next_actions = self.actor_target_network(next_states)
        q_target_next = self.critic_target_network(next_states, next_actions)
        q_target = rewards + gamma * q_target_next * (1 - dones)
        q_expected = self.critic_local_network(states, actions)

        critic_loss = F.mse_loss(q_expected, q_target)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()
        self.soft_update(self.critic_target_network, self.critic_local_network)

        actor_preds = self.actor_local_network(states)
        actor_loss = -self.critic_local_network(states, actor_preds).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        self.soft_update(self.actor_target_network, self.actor_local_network)

        self.epsilon -= self.epsilon_decay
        self.epsilon = max(self.epsilon_min, self.epsilon)
        self.noise.reset()

    def soft_update(self, target, local):
        for target_params, local_params in zip(target.parameters(),
                                               local.parameters()):
            target_params.data.copy_(self.tau * local_params.data
                                     + (1.0 - self.tau) * target_params.data)

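# Usage sketch (assumed): stepping the multi-agent DDPGAgent above with a vectorised
# environment that returns one row per agent. The env.step() return signature here is
# an assumption; adapt it to the actual environment wrapper in use.
def collect_step(env, agent, states):
    actions = agent.act(states, epsilon=agent.epsilon, add_noise=True)
    next_states, rewards, dones = env.step(actions)
    agent.step(states, actions, rewards, next_states, dones)   # one replay entry per agent
    return next_states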