import copy
import datetime
import os
import time

import numpy as np
import wandb
import torch.multiprocessing as mp

# Project-specific components (SACHP, SAC, ReplayBuffer, rollout,
# save_checkpoint) are assumed to be importable from this repository.


def main(args):
    device = "cuda" if args.cuda else "cpu"
    mp.set_start_method('spawn')

    # Input Experiment Hyperparameters
    hp = SACHP(EXP_NAME=args.name,
               DEVICE=device,
               ENV_NAME=args.env,
               N_ROLLOUT_PROCESSES=3,
               LEARNING_RATE=0.0001,
               EXP_GRAD_RATIO=10,
               BATCH_SIZE=256,
               GAMMA=0.95,
               REWARD_STEPS=3,
               ALPHA=0.015,
               LOG_SIG_MAX=2,
               LOG_SIG_MIN=-20,
               EPSILON=1e-6,
               REPLAY_SIZE=100000,
               REPLAY_INITIAL=512,
               SAVE_FREQUENCY=100000,
               GIF_FREQUENCY=100000,
               TOTAL_GRAD_STEPS=1000000)
    wandb.init(project='RoboCIn-RL',
               name=hp.EXP_NAME,
               entity='robocin',
               config=hp.to_dict())
    current_time = datetime.datetime.now().strftime('%b-%d_%H-%M-%S')
    tb_path = os.path.join(
        'runs', current_time + '_' + hp.ENV_NAME + '_' + hp.EXP_NAME)

    # Training
    sac = SAC(hp)
    buffer = ReplayBuffer(buffer_size=hp.REPLAY_SIZE,
                          observation_space=hp.observation_space,
                          action_space=hp.action_space,
                          device=hp.DEVICE)

    # Playing
    sac.share_memory()
    exp_queue = mp.Queue(maxsize=hp.EXP_GRAD_RATIO)
    finish_event = mp.Event()
    gif_req_m = mp.Value('i', -1)
    data_proc = mp.Process(target=rollout,
                           args=(sac, device, exp_queue,
                                 finish_event, gif_req_m, hp))
    data_proc.start()

    n_grads = 0
    n_samples = 0
    n_episodes = 0
    best_reward = None
    last_gif = None
    try:
        while n_grads < hp.TOTAL_GRAD_STEPS:
            metrics = {}
            ep_infos = list()
            st_time = time.perf_counter()

            # Collect EXP_GRAD_RATIO samples for each grad step
            new_samples = 0
            while new_samples < hp.EXP_GRAD_RATIO:
                exp = exp_queue.get()
                if exp is None:
                    raise Exception  # got None value in queue
                safe_exp = copy.deepcopy(exp)
                del exp

                # Dict is returned with end of episode info
                if isinstance(safe_exp, dict):
                    logs = {
                        "ep_info/" + key: value
                        for key, value in safe_exp.items()
                        if 'truncated' not in key
                    }
                    ep_infos.append(logs)
                    n_episodes += 1
                else:
                    if safe_exp.last_state is not None:
                        last_state = safe_exp.last_state
                    else:
                        last_state = safe_exp.state
                    buffer.add(obs=safe_exp.state,
                               next_obs=last_state,
                               action=safe_exp.action,
                               reward=safe_exp.reward,
                               done=False if safe_exp.last_state is not None else True)
                    new_samples += 1
            n_samples += new_samples
            sample_time = time.perf_counter()

            # Only start training after buffer is larger than initial value
            if buffer.size() < hp.REPLAY_INITIAL:
                continue

            # Sample a batch and load it as a tensor on device
            batch = buffer.sample(hp.BATCH_SIZE)
            metrics["train/loss_pi"], metrics["train/loss_Q1"], \
                metrics["train/loss_Q2"], metrics["train/loss_alpha"], \
                metrics["train/alpha"] = sac.update(batch=batch, metrics=metrics)

            n_grads += 1
            grad_time = time.perf_counter()
            metrics['speed/samples'] = new_samples / (sample_time - st_time)
            metrics['speed/grad'] = 1 / (grad_time - sample_time)
            metrics['speed/total'] = 1 / (grad_time - st_time)
            metrics['counters/samples'] = n_samples
            metrics['counters/grads'] = n_grads
            metrics['counters/episodes'] = n_episodes
            metrics["counters/buffer_len"] = buffer.size()

            if ep_infos:
                for key in ep_infos[0].keys():
                    metrics[key] = np.mean([info[key] for info in ep_infos])

            # Log metrics
            wandb.log(metrics)
            if hp.SAVE_FREQUENCY and n_grads % hp.SAVE_FREQUENCY == 0:
                save_checkpoint(hp=hp,
                                metrics={
                                    'alpha': sac.alpha,
                                    'n_samples': n_samples,
                                    'n_grads': n_grads,
                                    'n_episodes': n_episodes
                                },
                                pi=sac.pi,
                                Q=sac.Q,
                                pi_opt=sac.pi_opt,
                                Q_opt=sac.Q_opt)

            if hp.GIF_FREQUENCY and n_grads % hp.GIF_FREQUENCY == 0:
                gif_req_m.value = n_grads

    except KeyboardInterrupt:
        print("...Finishing...")
        finish_event.set()

    finally:
        if exp_queue:
            while exp_queue.qsize() > 0:
                exp_queue.get()
        print('queue is empty')
        print("Waiting for threads to finish...")
        data_proc.terminate()
        data_proc.join()
        del exp_queue
        del sac
        finish_event.set()
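# Minimal sketch of a command-line entry point for main() above. The flag names
# (--cuda, --name, --env) are inferred from the attributes main() reads
# (args.cuda, args.name, args.env); everything else here (description, help
# strings, required flags) is an illustrative assumption, not the project's
# actual CLI.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Train SAC with a separate rollout process")
    parser.add_argument("--cuda", action="store_true",
                        help="run networks on the GPU")
    parser.add_argument("--name", type=str, required=True,
                        help="experiment name (used as the wandb run name)")
    parser.add_argument("--env", type=str, required=True,
                        help="environment id")
    main(parser.parse_args())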
# Fragment of a training loop variant in which the SAC losses and the entropy
# temperature are computed directly in the training script (loss_sac,
# log_alpha, alpha_optim) instead of inside sac.update().
        if safe_exp.last_state is not None:
            last_state = safe_exp.last_state
        else:
            last_state = safe_exp.state
        buffer.add(obs=safe_exp.state,
                   next_obs=last_state,
                   action=safe_exp.action,
                   reward=safe_exp.reward,
                   done=False if safe_exp.last_state is not None else True)
        new_samples += 1
n_samples += new_samples
sample_time = time.perf_counter()

# Only start training after buffer is larger than initial value
if buffer.size() < hp.REPLAY_INITIAL:
    continue

# Sample a batch and load it as a tensor on device
batch = buffer.sample(hp.BATCH_SIZE)
pi_loss, Q_loss1, Q_loss2, log_pi = loss_sac(
    alpha, hp.GAMMA**hp.REWARD_STEPS, batch, Q, pi, tgt_Q, device)

# Train the entropy temperature alpha
alpha_loss = -(log_alpha * (log_pi + target_entropy).detach())
alpha_loss = alpha_loss.mean()
alpha_optim.zero_grad()
alpha_loss.backward()
alpha_optim.step()
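# A sketch of how the entropy-temperature variables used in the fragment above
# are commonly initialized for SAC with automatic entropy tuning. The names
# match the fragment (alpha, log_alpha, target_entropy, alpha_optim), but the
# exact setup in this codebase may differ; hp.action_space, hp.LEARNING_RATE
# and device are assumed from the training script above.
import numpy as np
import torch

target_entropy = -float(np.prod(hp.action_space.shape))  # common heuristic: -dim(A)
log_alpha = torch.zeros(1, requires_grad=True, device=device)
alpha = log_alpha.exp().detach()
alpha_optim = torch.optim.Adam([log_alpha], lr=hp.LEARNING_RATE)

# After each alpha_optim.step(), the temperature used in the losses is
# typically refreshed with: alpha = log_alpha.exp().detach()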
import numpy as np

# Project-specific components (Actor, Critic, OUNoise, ReplayBuffer) are
# assumed to be importable from this repository.


class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0.0
        self.exploration_theta = 0.125   # previously tried: 0.14, 0.1
        self.exploration_sigma = 0.0009  # previously tried: 0.001, 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.998  # discount factor (previously tried: 0.99, 0.9)
        self.tau = 0.099    # soft-update rate for target parameters
                            # (previously tried: 0.001, 0.01, 0.1, 0.05)

        # Score tracker
        self.best_score = -np.inf
        self.score = 0

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        self.score = 0
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if self.memory.size() > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

        # Score tracker
        self.score += reward
        if done:
            self.best_score = max(self.score, self.best_score)

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(states, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        # Add some noise for exploration
        return list(action + self.noise.sample())

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element
        # (states, actions, rewards, etc.)
        states = np.vstack(
            [xp.state for xp in experiences if xp is not None])
        actions = np.array(
            [xp.action for xp in experiences if xp is not None]).astype(
                np.float32).reshape(-1, self.action_size)
        rewards = np.array(
            [xp.reward for xp in experiences if xp is not None]).astype(
                np.float32).reshape(-1, 1)
        dones = np.array(
            [xp.done for xp in experiences if xp is not None]).astype(
                np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [xp.next_state for xp in experiences if xp is not None])

        # Get predicted next-state actions and Q values from target models
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local) using the critic's action gradients
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
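# Minimal sketch of an episode loop driving the DDPG agent above, using only
# the agent's own API (reset_episode, act, step). The pre-built `task` object,
# the number of episodes, and the assumption that task.step(action) returns
# (next_state, reward, done) are illustrative and may differ from this
# project's actual Task interface.
num_episodes = 500  # illustrative value

agent = DDPG(task)  # `task` is assumed to be constructed elsewhere
for i_episode in range(1, num_episodes + 1):
    state = agent.reset_episode()
    done = False
    while not done:
        action = agent.act(state)
        next_state, reward, done = task.step(action)
        agent.step(action, reward, next_state, done)
        state = next_state
    print("Episode {:4d} | score: {:8.3f} | best: {:8.3f}".format(
        i_episode, agent.score, agent.best_score))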