def main():
    logger.info("-----------------Carla_SAC-------------------")
    logger.set_dir('./{}_eval'.format(args.env))

    # env for eval
    eval_env_params = EnvConfig['test_env_params']
    eval_env = LocalEnv(args.env, eval_env_params)

    obs_dim = eval_env.obs_dim
    action_dim = eval_env.action_dim

    # Initialize model, algorithm, agent
    if args.framework == 'torch':
        CarlaModel, SAC, CarlaAgent = TorchModel, TorchSAC, TorchAgent
    elif args.framework == 'paddle':
        CarlaModel, SAC, CarlaAgent = PaddleModel, PaddleSAC, PaddleAgent
    model = CarlaModel(obs_dim, action_dim)
    algorithm = SAC(
        model,
        gamma=GAMMA,
        tau=TAU,
        alpha=ALPHA,
        actor_lr=ACTOR_LR,
        critic_lr=CRITIC_LR)
    agent = CarlaAgent(algorithm)

    # restore trained agent
    agent.restore('./{}'.format(args.restore_model))

    # Evaluate episode
    for episode in range(args.eval_episodes):
        episode_reward = run_episode(agent, eval_env)
        tensorboard.add_scalar('eval/episode_reward', episode_reward, episode)
        logger.info('Evaluation episode reward: {}'.format(episode_reward))
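# NOTE: run_episode is referenced above but not defined in this file. The sketch
# below is an assumption, not the original helper: it presumes a Gym-style
# LocalEnv interface (reset() -> obs, step(action) -> (obs, reward, done, info))
# and an agent with a deterministic predict() method.
def run_episode(agent, env):
    episode_reward = 0.
    obs = env.reset()
    done = False
    while not done:
        action = agent.predict(obs)
        obs, reward, done, _ = env.step(action)
        episode_reward += reward
    return episode_reward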
def log_metrics(self):
    """ Log metrics of learner and actors """
    if self.start_time is None:
        return

    metrics = []
    while True:
        try:
            metric = self.remote_metrics_queue.get_nowait()
            metrics.append(metric)
        except queue.Empty:
            break

    episode_rewards, episode_steps = [], []
    for x in metrics:
        episode_rewards.extend(x['episode_rewards'])
        episode_steps.extend(x['episode_steps'])
    max_episode_rewards, mean_episode_rewards, min_episode_rewards, \
        max_episode_steps, mean_episode_steps, min_episode_steps = \
        None, None, None, None, None, None
    if episode_rewards:
        mean_episode_rewards = np.mean(np.array(episode_rewards).flatten())
        max_episode_rewards = np.max(np.array(episode_rewards).flatten())
        min_episode_rewards = np.min(np.array(episode_rewards).flatten())

        mean_episode_steps = np.mean(np.array(episode_steps).flatten())
        max_episode_steps = np.max(np.array(episode_steps).flatten())
        min_episode_steps = np.min(np.array(episode_steps).flatten())

    metric = {
        'Sample_steps': self.sample_total_steps,
        'max_episode_rewards': max_episode_rewards,
        'mean_episode_rewards': mean_episode_rewards,
        'min_episode_rewards': min_episode_rewards,
        'max_episode_steps': max_episode_steps,
        'mean_episode_steps': mean_episode_steps,
        'min_episode_steps': min_episode_steps,
        'sample_queue_size': self.sample_data_queue.qsize(),
        'total_params_sync': self.total_params_sync,
        'cache_params_sent_cnt': self.cache_params_sent_cnt,
        'total_loss': self.total_loss_stat.mean,
        'pi_loss': self.pi_loss_stat.mean,
        'vf_loss': self.vf_loss_stat.mean,
        'entropy': self.entropy_stat.mean,
        'kl': self.kl_stat.mean,
        'learn_time_s': self.learn_time_stat.mean,
        'elapsed_time_s': int(time.time() - self.start_time),
        'lr': self.lr,
        'entropy_coeff': self.entropy_coeff,
    }

    for key, value in metric.items():
        if value is not None:
            tensorboard.add_scalar(key, value, self.sample_total_steps)

    logger.info(metric)
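# NOTE: the *_stat attributes referenced above (total_loss_stat, pi_loss_stat,
# learn_time_stat, ...) are assumed to be small windowed-statistics helpers that
# expose a .mean property. The class below is an illustrative sketch under that
# assumption, not the library's actual implementation.
import collections


class WindowedStat(object):
    """Keeps the most recent `window_size` values and reports their mean."""

    def __init__(self, window_size=100):
        self.values = collections.deque(maxlen=window_size)

    def add(self, value):
        self.values.append(value)

    @property
    def mean(self):
        # Return None when empty so callers can skip logging it.
        return sum(self.values) / len(self.values) if self.values else None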
def train(self,
          num_frames: int,
          plotting_interval: int = 200,
          plot: bool = False):
    """Train the agent."""
    self.is_test = False

    state = self.env.reset()
    update_cnt = 0
    losses = []
    scores = []
    score = 0

    for frame_idx in range(1, num_frames + 1):
        action = self.sample(state)
        next_state, reward, done = self.step(action)

        state = next_state
        score += reward

        # NoisyNet: removed decrease of epsilon
        # PER: increase beta
        fraction = min(frame_idx / num_frames, 1.0)
        self.beta = self.beta + fraction * (1.0 - self.beta)

        # if episode ends
        if done:
            state = self.env.reset()
            scores.append(score)
            tensorboard.add_scalar('score', score)
            score = 0

        # if training is ready
        if len(self.memory) >= self.batch_size:
            loss = self.learn()
            losses.append(loss)
            tensorboard.add_scalar('loss', loss)
            update_cnt += 1

            # if hard update is needed
            if update_cnt % self.target_update == 0:
                self.algorithm._target_hard_update()

        # plotting
        if frame_idx % plotting_interval == 0:
            self._plot(frame_idx, scores, losses, plot=plot)

    self.env.close()
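# NOTE: _plot is called at the end of the training loop above but is not shown
# here. The method below is a minimal sketch assuming matplotlib is available;
# the original helper may render additional panels or save figures instead.
def _plot(self, frame_idx, scores, losses, plot=False):
    if not plot:
        return
    import matplotlib.pyplot as plt
    plt.figure(figsize=(10, 4))
    plt.subplot(121)
    plt.title('frame {}, avg score {:.2f}'.format(
        frame_idx, np.mean(scores[-10:]) if scores else 0.0))
    plt.plot(scores)
    plt.subplot(122)
    plt.title('loss')
    plt.plot(losses)
    plt.show()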
def get_obs(self):
    for i in range(self.env_num):
        self.total_steps += 1
        self.episode_steps_list[i] += 1
        self.episode_reward_list[i] += self.reward_list[i]

        self.obs_list[i] = self.next_obs_list[i]
        if self.done_list[i] or \
                self.episode_steps_list[i] >= self._max_episode_steps:
            tensorboard.add_scalar('train/episode_reward_env{}'.format(i),
                                   self.episode_reward_list[i],
                                   self.total_steps)
            logger.info('Train env {} done, Reward: {}'.format(
                i, self.episode_reward_list[i]))
            self.episode_steps_list[i] = 0
            self.episode_reward_list[i] = 0
            obs_list_i = self.env_list[i].reset()
            self.obs_list[i] = obs_list_i.get()
            self.obs_list[i] = np.array(self.obs_list[i])
    return self.obs_list
def learn(self):
    """Each iteration:
    1. Performs numEps episodes of self-play.
    2. Retrains neural network with examples in trainExamplesHistory
       (which has a maximum length of numItersForTrainExamplesHistory).
    3. Evaluates the new neural network with the test dataset.
    4. Pits the new neural network against the old one and accepts it only
       if it wins >= updateThreshold fraction of games.
    """
    # create remote actors to run tasks (self-play/pitting/evaluate_test_dataset) in parallel.
    self._create_remote_actors()

    for iteration in range(1, self.args.numIters + 1):
        logger.info('Starting Iter #{} ...'.format(iteration))

        ####################
        logger.info('Step1: self-play in parallel...')
        iterationTrainExamples = []
        # update weights of remote actors to the latest weights, and ask them to run self-play task
        for signal_queue in self.remote_actors_signal_queues:
            signal_queue.put({"task": "self-play"})
        # wait for all remote actors (a total of self.args.actors_num) to return the self-play results
        for _ in range(self.args.actors_num):
            result = self.remote_actors_return_queue.get()
            iterationTrainExamples.extend(result["self-play"])

        # save the iteration examples to the history
        self.trainExamplesHistory.append(iterationTrainExamples)
        if len(self.trainExamplesHistory) > \
                self.args.numItersForTrainExamplesHistory:
            logger.warning("Removing the oldest entry in trainExamples.")
            self.trainExamplesHistory.pop(0)
        self.saveTrainExamples(iteration)  # backup history to a file

        ####################
        logger.info('Step2: train neural network...')
        # shuffle examples before training
        trainExamples = []
        for e in self.trainExamplesHistory:
            trainExamples.extend(e)
        shuffle(trainExamples)

        # training new network, keeping a copy of the old one
        self.current_agent.save(
            os.path.join(self.args.checkpoint, 'temp.pth.tar'))
        self.previous_agent.restore(
            os.path.join(self.args.checkpoint, 'temp.pth.tar'))

        self.current_agent.learn(trainExamples)

        ####################
        logger.info('Step3: evaluate test dataset in parallel...')
        cnt = 0
        # update weights of remote actors to the latest weights, and ask them to evaluate assigned test dataset
        for i, data in enumerate(
                split_group(self.test_dataset,
                            len(self.test_dataset) // self.args.actors_num)):
            self.remote_actors_signal_queues[i].put({
                "task": "evaluate_test_dataset",
                "test_dataset": data
            })
            cnt += len(data)
        perfect_moves_cnt, good_moves_cnt = 0, 0
        # wait for all remote actors (a total of self.args.actors_num) to return the evaluating results
        for _ in range(self.args.actors_num):
            (perfect_moves, good_moves) = self.remote_actors_return_queue.get(
            )["evaluate_test_dataset"]
            perfect_moves_cnt += perfect_moves
            good_moves_cnt += good_moves
        logger.info('perfect moves rate: {}, good moves rate: {}'.format(
            perfect_moves_cnt / cnt, good_moves_cnt / cnt))
        tensorboard.add_scalar('perfect_moves_rate', perfect_moves_cnt / cnt,
                               iteration)
        tensorboard.add_scalar('good_moves_rate', good_moves_cnt / cnt,
                               iteration)

        ####################
        logger.info(
            'Step4: pitting against previous generation in parallel...')
        # transfer weights of previous generation and current generation to the remote actors, and ask them to pit.
        for signal_queue in self.remote_actors_signal_queues:
            signal_queue.put({"task": "pitting"})
        previous_wins, current_wins, draws = 0, 0, 0
        for _ in range(self.args.actors_num):
            (pwins_, cwins_, draws_) = self.remote_actors_return_queue.get(
            )["pitting"]
            previous_wins += pwins_
            current_wins += cwins_
            draws += draws_

        logger.info('NEW/PREV WINS : %d / %d ; DRAWS : %d' %
                    (current_wins, previous_wins, draws))
        if previous_wins + current_wins == 0 or float(current_wins) / (
                previous_wins + current_wins) < self.args.updateThreshold:
            logger.info('REJECTING NEW MODEL')
            self.current_agent.restore(
                os.path.join(self.args.checkpoint, 'temp.pth.tar'))
        else:
            logger.info('ACCEPTING NEW MODEL')
            self.current_agent.save(
                os.path.join(self.args.checkpoint, 'best.pth.tar'))
            self.current_agent.save(
                os.path.join(self.args.checkpoint,
                             self.getCheckpointFile(iteration)))
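# NOTE: split_group is used in Step3 above but not defined in this file. It is
# assumed to split the test dataset into roughly equal chunks, one per remote
# actor. The sketch below is illustrative only; the original helper presumably
# also guarantees that the number of chunks never exceeds args.actors_num.
def split_group(dataset, chunk_size):
    """Yield successive chunks of `dataset` with at most `chunk_size` items."""
    chunk_size = max(1, chunk_size)
    for start in range(0, len(dataset), chunk_size):
        yield dataset[start:start + chunk_size]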
def main():
    logger.info("-----------------Carla_SAC-------------------")
    logger.set_dir('./{}_train'.format(args.env))

    # Parallel environments for training
    train_envs_params = EnvConfig['train_envs_params']
    env_num = EnvConfig['env_num']
    env_list = ParallelEnv(args.env, args.xparl_addr, train_envs_params)

    # env for eval
    eval_env_params = EnvConfig['eval_env_params']
    eval_env = LocalEnv(args.env, eval_env_params)

    obs_dim = eval_env.obs_dim
    action_dim = eval_env.action_dim

    # Initialize model, algorithm, agent, replay_memory
    if args.framework == 'torch':
        CarlaModel, SAC, CarlaAgent = TorchModel, TorchSAC, TorchAgent
    elif args.framework == 'paddle':
        CarlaModel, SAC, CarlaAgent = PaddleModel, PaddleSAC, PaddleAgent
    model = CarlaModel(obs_dim, action_dim)
    algorithm = SAC(
        model,
        gamma=GAMMA,
        tau=TAU,
        alpha=ALPHA,
        actor_lr=ACTOR_LR,
        critic_lr=CRITIC_LR)
    agent = CarlaAgent(algorithm)
    rpm = ReplayMemory(
        max_size=MEMORY_SIZE, obs_dim=obs_dim, act_dim=action_dim)

    total_steps = 0
    last_save_steps = 0
    test_flag = 0

    obs_list = env_list.reset()

    while total_steps < args.train_total_steps:
        # Train episode
        if rpm.size() < WARMUP_STEPS:
            action_list = [
                np.random.uniform(-1, 1, size=action_dim)
                for _ in range(env_num)
            ]
        else:
            action_list = [agent.sample(obs) for obs in obs_list]
        next_obs_list, reward_list, done_list, info_list = env_list.step(
            action_list)

        # Store data in replay memory
        for i in range(env_num):
            rpm.append(obs_list[i], action_list[i], reward_list[i],
                       next_obs_list[i], done_list[i])

        obs_list = env_list.get_obs()
        total_steps = env_list.total_steps

        # Train agent after collecting sufficient data
        if rpm.size() >= WARMUP_STEPS:
            batch_obs, batch_action, batch_reward, batch_next_obs, batch_terminal = rpm.sample_batch(
                BATCH_SIZE)
            agent.learn(batch_obs, batch_action, batch_reward, batch_next_obs,
                        batch_terminal)

        # Save agent
        if total_steps > int(1e5) and total_steps > last_save_steps + int(1e4):
            agent.save('./{}_model/step_{}_model.ckpt'.format(
                args.framework, total_steps))
            last_save_steps = total_steps

        # Evaluate episode
        if (total_steps + 1) // args.test_every_steps >= test_flag:
            while (total_steps + 1) // args.test_every_steps >= test_flag:
                test_flag += 1
            avg_reward = run_evaluate_episodes(agent, eval_env, EVAL_EPISODES)
            tensorboard.add_scalar('eval/episode_reward', avg_reward,
                                   total_steps)
            logger.info(
                'Total steps {}, Evaluation over {} episodes, Average reward: {}'
                .format(total_steps, EVAL_EPISODES, avg_reward))
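# NOTE: run_evaluate_episodes is referenced above but not defined in this file.
# The sketch below is an assumption: it reuses the same Gym-style LocalEnv
# interface as the run_episode sketch and averages the return over
# eval_episodes deterministic rollouts; the original helper may differ.
def run_evaluate_episodes(agent, env, eval_episodes):
    avg_reward = 0.
    for _ in range(eval_episodes):
        obs = env.reset()
        done = False
        while not done:
            action = agent.predict(obs)
            obs, reward, done, _ = env.step(action)
            avg_reward += reward
    avg_reward /= eval_episodes
    return avg_reward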