Example #1
def main():
    logger.info("-----------------Carla_SAC-------------------")
    logger.set_dir('./{}_eval'.format(args.env))

    # env for eval
    eval_env_params = EnvConfig['test_env_params']
    eval_env = LocalEnv(args.env, eval_env_params)

    obs_dim = eval_env.obs_dim
    action_dim = eval_env.action_dim

    # Initialize model, algorithm, agent
    if args.framework == 'torch':
        CarlaModel, SAC, CarlaAgent = TorchModel, TorchSAC, TorchAgent
    elif args.framework == 'paddle':
        CarlaModel, SAC, CarlaAgent = PaddleModel, PaddleSAC, PaddleAgent
    else:
        raise ValueError('Unsupported framework: {}'.format(args.framework))
    model = CarlaModel(obs_dim, action_dim)
    algorithm = SAC(
        model,
        gamma=GAMMA,
        tau=TAU,
        alpha=ALPHA,
        actor_lr=ACTOR_LR,
        critic_lr=CRITIC_LR)
    agent = CarlaAgent(algorithm)
    # restore trained agent
    agent.restore('./{}'.format(args.restore_model))

    # Evaluate episode
    for episode in range(args.eval_episodes):
        episode_reward = run_episode(agent, eval_env)
        tensorboard.add_scalar('eval/episode_reward', episode_reward, episode)
        logger.info('Evaluation episode reward: {}'.format(episode_reward))
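
The run_episode helper called above is defined elsewhere in the example script. Below is a minimal sketch of what such an evaluation rollout might look like, assuming a gym-style LocalEnv whose step returns (obs, reward, done, info) and an agent with a deterministic predict method; the names are illustrative, not PARL's exact API.

# Hypothetical evaluation rollout, for illustration only.
def run_episode(agent, env):
    obs = env.reset()
    episode_reward, done = 0.0, False
    while not done:
        action = agent.predict(obs)  # deterministic action for evaluation
        obs, reward, done, _ = env.step(action)
        episode_reward += reward
    return episode_reward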
Example #2
    def log_metrics(self):
        """ Log metrics of learner and actors
        """
        if self.start_time is None:
            return

        metrics = []
        while True:
            try:
                metric = self.remote_metrics_queue.get_nowait()
                metrics.append(metric)
            except queue.Empty:
                break

        episode_rewards, episode_steps = [], []
        for x in metrics:
            episode_rewards.extend(x['episode_rewards'])
            episode_steps.extend(x['episode_steps'])
        # Default to None so missing metrics are skipped when written below.
        max_episode_rewards = mean_episode_rewards = min_episode_rewards = None
        max_episode_steps = mean_episode_steps = min_episode_steps = None
        if episode_rewards:
            rewards = np.array(episode_rewards).flatten()
            steps = np.array(episode_steps).flatten()
            mean_episode_rewards = np.mean(rewards)
            max_episode_rewards = np.max(rewards)
            min_episode_rewards = np.min(rewards)

            mean_episode_steps = np.mean(steps)
            max_episode_steps = np.max(steps)
            min_episode_steps = np.min(steps)

        metric = {
            'Sample_steps': self.sample_total_steps,
            'max_episode_rewards': max_episode_rewards,
            'mean_episode_rewards': mean_episode_rewards,
            'min_episode_rewards': min_episode_rewards,
            'max_episode_steps': max_episode_steps,
            'mean_episode_steps': mean_episode_steps,
            'min_episode_steps': min_episode_steps,
            'sample_queue_size': self.sample_data_queue.qsize(),
            'total_params_sync': self.total_params_sync,
            'cache_params_sent_cnt': self.cache_params_sent_cnt,
            'total_loss': self.total_loss_stat.mean,
            'pi_loss': self.pi_loss_stat.mean,
            'vf_loss': self.vf_loss_stat.mean,
            'entropy': self.entropy_stat.mean,
            'kl': self.kl_stat.mean,
            'learn_time_s': self.learn_time_stat.mean,
            'elapsed_time_s': int(time.time() - self.start_time),
            'lr': self.lr,
            'entropy_coeff': self.entropy_coeff,
        }

        for key, value in metric.items():
            if value is not None:
                tensorboard.add_scalar(key, value, self.sample_total_steps)

        logger.info(metric)
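
The *_stat objects read above (for example total_loss_stat.mean) only need to expose a running mean over recent values. A minimal sketch of such a windowed-statistics helper is shown below; the class name RunningStat and its interface are assumptions for illustration, not the helper PARL actually uses.

from collections import deque

import numpy as np


class RunningStat(object):
    """Hypothetical windowed statistic: keep the last `size` values, expose their mean."""

    def __init__(self, size=100):
        self.values = deque(maxlen=size)

    def add(self, value):
        self.values.append(value)

    @property
    def mean(self):
        return float(np.mean(self.values)) if self.values else None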
Example #3
    def train(self,
              num_frames: int,
              plotting_interval: int = 200,
              plot: bool = False):
        """Train the agent."""
        self.is_test = False

        state = self.env.reset()
        update_cnt = 0
        losses = []
        scores = []
        score = 0

        for frame_idx in range(1, num_frames + 1):
            action = self.sample(state)
            next_state, reward, done = self.step(action)

            state = next_state
            score += reward

            # NoisyNet: removed decrease of epsilon

            # PER: increase beta
            fraction = min(frame_idx / num_frames, 1.0)
            self.beta = self.beta + fraction * (1.0 - self.beta)

            # if episode ends
            if done:
                state = self.env.reset()
                scores.append(score)
                tensorboard.add_scalar('score', score, frame_idx)
                score = 0

            # if training is ready
            if len(self.memory) >= self.batch_size:
                loss = self.learn()
                losses.append(loss)
                tensorboard.add_scalar('loss', loss, frame_idx)
                update_cnt += 1

                # if hard update is needed
                if update_cnt % self.target_update == 0:
                    self.algorithm._target_hard_update()

            # plotting
            if frame_idx % plotting_interval == 0:
                self._plot(frame_idx, scores, losses, plot=plot)

        self.env.close()
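
The call to self.algorithm._target_hard_update() above copies the online network's parameters into the target network every target_update learn steps. A minimal PyTorch-style sketch of such a hard update is given below, assuming the algorithm holds model and target_model attributes; this is an illustration, not the exact PARL implementation.

import torch


class AlgorithmWithHardUpdate(object):
    """Hypothetical container showing how a hard target update can be implemented."""

    def __init__(self, model: torch.nn.Module, target_model: torch.nn.Module):
        self.model = model
        self.target_model = target_model

    def _target_hard_update(self):
        # Overwrite every parameter of the target network with the online network's.
        self.target_model.load_state_dict(self.model.state_dict())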
Example #4
    def get_obs(self):
        for i in range(self.env_num):
            self.total_steps += 1
            self.episode_steps_list[i] += 1
            self.episode_reward_list[i] += self.reward_list[i]

            self.obs_list[i] = self.next_obs_list[i]
            if (self.done_list[i]
                    or self.episode_steps_list[i] >= self._max_episode_steps):
                tensorboard.add_scalar('train/episode_reward_env{}'.format(i),
                                       self.episode_reward_list[i],
                                       self.total_steps)
                logger.info('Train env {} done, Reward: {}'.format(
                    i, self.episode_reward_list[i]))

                self.episode_steps_list[i] = 0
                self.episode_reward_list[i] = 0
                # reset() on the remote env returns a future-like handle;
                # .get() blocks until the fresh observation is available.
                obs_list_i = self.env_list[i].reset()
                self.obs_list[i] = np.array(obs_list_i.get())
        return self.obs_list
Example #5
File: Coach.py  Project: YuechengLiu/PARL
    def learn(self):
        """Each iteration:
        1. Performs numEps episodes of self-play.
        2. Retrains neural network with examples in trainExamplesHistory
           (which has a maximum length of numItersForTrainExamplesHistory).
        3. Evaluates the new neural network with the test dataset.
        4. Pits the new neural network against the old one and accepts it
           only if it wins >= updateThreshold fraction of games.
        """

        # create remote actors to run tasks (self-play/pitting/evaluate_test_dataset) in parallel.
        self._create_remote_actors()

        for iteration in range(1, self.args.numIters + 1):
            logger.info('Starting Iter #{} ...'.format(iteration))

            ####################
            logger.info('Step1: self-play in parallel...')
            iterationTrainExamples = []
            # update weights of remote actors to the latest weights, and ask them to run self-play task
            for signal_queue in self.remote_actors_signal_queues:
                signal_queue.put({"task": "self-play"})
            # wait for all remote actors (a total of self.args.actors_num) to return the self-play results
            for _ in range(self.args.actors_num):
                result = self.remote_actors_return_queue.get()
                iterationTrainExamples.extend(result["self-play"])

            # save the iteration examples to the history
            self.trainExamplesHistory.append(iterationTrainExamples)
            if (len(self.trainExamplesHistory) >
                    self.args.numItersForTrainExamplesHistory):
                logger.warning("Removing the oldest entry in trainExamples.")
                self.trainExamplesHistory.pop(0)
            self.saveTrainExamples(iteration)  # backup history to a file

            ####################
            logger.info('Step2: train neural network...')
            # shuffle examples before training
            trainExamples = []
            for e in self.trainExamplesHistory:
                trainExamples.extend(e)
            shuffle(trainExamples)

            # training new network, keeping a copy of the old one
            self.current_agent.save(
                os.path.join(self.args.checkpoint, 'temp.pth.tar'))
            self.previous_agent.restore(
                os.path.join(self.args.checkpoint, 'temp.pth.tar'))

            self.current_agent.learn(trainExamples)

            ####################
            logger.info('Step3: evaluate test dataset in parallel...')
            cnt = 0
            # update weights of remote actors to the latest weights, and ask them to evaluate assigned test dataset
            group_size = len(self.test_dataset) // self.args.actors_num
            for i, data in enumerate(split_group(self.test_dataset, group_size)):
                self.remote_actors_signal_queues[i].put({
                    "task": "evaluate_test_dataset",
                    "test_dataset": data
                })
                cnt += len(data)
            perfect_moves_cnt, good_moves_cnt = 0, 0
            # wait for all remote actors (a total of self.args.actors_num) to return the evaluating results
            for _ in range(self.args.actors_num):
                eval_result = self.remote_actors_return_queue.get()
                perfect_moves, good_moves = eval_result["evaluate_test_dataset"]
                perfect_moves_cnt += perfect_moves
                good_moves_cnt += good_moves
            logger.info('perfect moves rate: {}, good moves rate: {}'.format(
                perfect_moves_cnt / cnt, good_moves_cnt / cnt))
            tensorboard.add_scalar('perfect_moves_rate',
                                   perfect_moves_cnt / cnt, iteration)
            tensorboard.add_scalar('good_moves_rate', good_moves_cnt / cnt,
                                   iteration)

            ####################
            logger.info(
                'Step4: pitting against previous generation in parallel...')
            # transfer weights of previous generation and current generation to the remote actors, and ask them to pit.
            for signal_queue in self.remote_actors_signal_queues:
                signal_queue.put({"task": "pitting"})
            previous_wins, current_wins, draws = 0, 0, 0
            for _ in range(self.args.actors_num):
                pitting_result = self.remote_actors_return_queue.get()["pitting"]
                pwins_, cwins_, draws_ = pitting_result
                previous_wins += pwins_
                current_wins += cwins_
                draws += draws_

            logger.info('NEW/PREV WINS : %d / %d ; DRAWS : %d' %
                        (current_wins, previous_wins, draws))
            if previous_wins + current_wins == 0 or float(current_wins) / (
                    previous_wins + current_wins) < self.args.updateThreshold:
                logger.info('REJECTING NEW MODEL')
                self.current_agent.restore(
                    os.path.join(self.args.checkpoint, 'temp.pth.tar'))
            else:
                logger.info('ACCEPTING NEW MODEL')
                self.current_agent.save(
                    os.path.join(self.args.checkpoint, 'best.pth.tar'))
            self.current_agent.save(
                os.path.join(self.args.checkpoint,
                             self.getCheckpointFile(iteration)))
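
split_group above divides the test dataset into consecutive chunks, one chunk per remote actor. A minimal sketch of such a chunking helper is given below; the implementation is an assumption for illustration and may differ from the helper used in the project.

# Hypothetical chunking helper: yield consecutive slices of group_size items.
def split_group(dataset, group_size):
    for start in range(0, len(dataset), group_size):
        yield dataset[start:start + group_size]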
Example #6
def main():
    logger.info("-----------------Carla_SAC-------------------")
    logger.set_dir('./{}_train'.format(args.env))

    # Parallel environments for training
    train_envs_params = EnvConfig['train_envs_params']
    env_num = EnvConfig['env_num']
    env_list = ParallelEnv(args.env, args.xparl_addr, train_envs_params)

    # env for eval
    eval_env_params = EnvConfig['eval_env_params']
    eval_env = LocalEnv(args.env, eval_env_params)

    obs_dim = eval_env.obs_dim
    action_dim = eval_env.action_dim

    # Initialize model, algorithm, agent, replay_memory
    if args.framework == 'torch':
        CarlaModel, SAC, CarlaAgent = TorchModel, TorchSAC, TorchAgent
    elif args.framework == 'paddle':
        CarlaModel, SAC, CarlaAgent = PaddleModel, PaddleSAC, PaddleAgent
    else:
        raise ValueError('Unsupported framework: {}'.format(args.framework))
    model = CarlaModel(obs_dim, action_dim)
    algorithm = SAC(
        model,
        gamma=GAMMA,
        tau=TAU,
        alpha=ALPHA,
        actor_lr=ACTOR_LR,
        critic_lr=CRITIC_LR)
    agent = CarlaAgent(algorithm)
    rpm = ReplayMemory(
        max_size=MEMORY_SIZE, obs_dim=obs_dim, act_dim=action_dim)

    total_steps = 0
    last_save_steps = 0
    test_flag = 0

    obs_list = env_list.reset()

    while total_steps < args.train_total_steps:
        # Train episode
        if rpm.size() < WARMUP_STEPS:
            action_list = [
                np.random.uniform(-1, 1, size=action_dim)
                for _ in range(env_num)
            ]
        else:
            action_list = [agent.sample(obs) for obs in obs_list]
        next_obs_list, reward_list, done_list, info_list = env_list.step(
            action_list)

        # Store data in replay memory
        for i in range(env_num):
            rpm.append(obs_list[i], action_list[i], reward_list[i],
                       next_obs_list[i], done_list[i])

        obs_list = env_list.get_obs()
        total_steps = env_list.total_steps
        # Train agent after collecting sufficient data
        if rpm.size() >= WARMUP_STEPS:
            batch_obs, batch_action, batch_reward, batch_next_obs, batch_terminal = rpm.sample_batch(
                BATCH_SIZE)
            agent.learn(batch_obs, batch_action, batch_reward, batch_next_obs,
                        batch_terminal)

        # Save agent
        if total_steps > int(1e5) and total_steps > last_save_steps + int(1e4):
            agent.save('./{}_model/step_{}_model.ckpt'.format(
                args.framework, total_steps))
            last_save_steps = total_steps

        # Evaluate episode
        if (total_steps + 1) // args.test_every_steps >= test_flag:
            while (total_steps + 1) // args.test_every_steps >= test_flag:
                test_flag += 1
            avg_reward = run_evaluate_episodes(agent, eval_env, EVAL_EPISODES)
            tensorboard.add_scalar('eval/episode_reward', avg_reward,
                                   total_steps)
            logger.info(
                'Total steps {}, Evaluation over {} episodes, Average reward: {}'
                .format(total_steps, EVAL_EPISODES, avg_reward))
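
run_evaluate_episodes above averages the return of the current policy over EVAL_EPISODES evaluation episodes. A minimal sketch under the same gym-style assumptions as the evaluation helper sketched after Example #1 is shown below; the names are illustrative, not PARL's exact API.

import numpy as np


# Hypothetical evaluation helper: average episode return of the deterministic policy.
def run_evaluate_episodes(agent, eval_env, eval_episodes):
    episode_rewards = []
    for _ in range(eval_episodes):
        obs = eval_env.reset()
        episode_reward, done = 0.0, False
        while not done:
            action = agent.predict(obs)
            obs, reward, done, _ = eval_env.step(action)
            episode_reward += reward
        episode_rewards.append(episode_reward)
    return np.mean(episode_rewards)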