Example #1
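    # Learner initialization for ES training (a snippet of the full Learner class in
    # Example #7): builds the Gym env, the observation MeanStdFilter, the shared noise
    # table, and the ES-based MujocoAgent, then spawns the remote actors.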
    def __init__(self, config):
        self.config = config

        env = gym.make(self.config['env_name'])
        self.config['obs_dim'] = env.observation_space.shape[0]
        self.config['act_dim'] = env.action_space.shape[0]

        self.obs_filter = MeanStdFilter(self.config['obs_dim'])
        self.noise = SharedNoiseTable(self.config['noise_size'])

        model = MujocoModel(self.config['act_dim'])
        algorithm = ES(model)
        self.agent = MujocoAgent(algorithm, self.config)

        self.latest_flat_weights = self.agent.get_flat_weights()
        self.latest_obs_filter = self.obs_filter.as_serializable()

        self.sample_total_episodes = 0
        self.sample_total_steps = 0

        self.actors_signal_input_queues = []
        self.actors_output_queues = []

        self.create_actors()

        self.eval_rewards_stat = WindowStat(self.config['report_window_size'])
        self.eval_lengths_stat = WindowStat(self.config['report_window_size'])
Example #2
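    # ES agent initialization (a snippet of the Actor class in Example #8): builds the
    # Gym env, the observation MeanStdFilter, the shared noise table, and the ES-based
    # MujocoAgent.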
    def __init__(self, config):
        self.config = config

        self.env = gym.make(self.config['env_name'])
        self.config['obs_dim'] = self.env.observation_space.shape[0]
        self.config['act_dim'] = self.env.action_space.shape[0]

        self.obs_filter = MeanStdFilter(self.config['obs_dim'])
        self.noise = SharedNoiseTable(self.config['noise_size'])

        model = MujocoModel(self.config['act_dim'])
        algorithm = ES(model)
        self.agent = MujocoAgent(algorithm, self.config)
Example #3
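# PPO training on a MuJoCo env with PARL: wraps the env with ActionMappingWrapper,
# adds a time-step feature to the observation, collects trajectory batches, updates
# the policy and value networks, and evaluates every `test_every_steps` steps.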
def main():
    env = gym.make(args.env)
    env = ActionMappingWrapper(env)

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    obs_dim += 1  # add 1 to obs dim for time step feature

    scaler = Scaler(obs_dim)

    model = MujocoModel(obs_dim, act_dim)
    alg = parl.algorithms.PPO(model,
                              act_dim=act_dim,
                              policy_lr=model.policy_lr,
                              value_lr=model.value_lr)
    agent = MujocoAgent(alg,
                        obs_dim,
                        act_dim,
                        args.kl_targ,
                        loss_type=args.loss_type)

    # run a few episodes to initialize scaler
    collect_trajectories(env, agent, scaler, episodes=5)

    test_flag = 0
    total_steps = 0
    while total_steps < args.train_total_steps:
        trajectories = collect_trajectories(env,
                                            agent,
                                            scaler,
                                            episodes=args.episodes_per_batch)
        total_steps += sum([t['obs'].shape[0] for t in trajectories])
        total_train_rewards = sum([np.sum(t['rewards']) for t in trajectories])

        train_obs, train_actions, train_advantages, train_discount_sum_rewards = build_train_data(
            trajectories, agent)

        policy_loss, kl = agent.policy_learn(train_obs, train_actions,
                                             train_advantages)
        value_loss = agent.value_learn(train_obs, train_discount_sum_rewards)

        logger.info(
            'Steps {}, Train reward: {}, Policy loss: {}, KL: {}, Value loss: {}'
            .format(total_steps, total_train_rewards / args.episodes_per_batch,
                    policy_loss, kl, value_loss))
        if total_steps // args.test_every_steps >= test_flag:
            while total_steps // args.test_every_steps >= test_flag:
                test_flag += 1
            eval_reward = run_evaluate_episode(env, agent, scaler)
            logger.info('Steps {}, Evaluate reward: {}'.format(
                total_steps, eval_reward))
Example #4
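# DDPG training on a MuJoCo env with PARL: warms up the replay memory first, then
# alternates between blocks of 50 training episodes and one evaluation episode.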
def main():
    env = gym.make(args.env)
    env.seed(ENV_SEED)
    env = ActionMappingWrapper(env)

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    model = MujocoModel(act_dim)
    algorithm = parl.algorithms.DDPG(model,
                                     gamma=GAMMA,
                                     tau=TAU,
                                     actor_lr=ACTOR_LR,
                                     critic_lr=CRITIC_LR)
    agent = MujocoAgent(algorithm, obs_dim, act_dim)

    rpm = ReplayMemory(MEMORY_SIZE, obs_dim, act_dim)

    while rpm.size() < MEMORY_WARMUP_SIZE:
        run_train_episode(env, agent, rpm)

    episode = 0
    while episode < args.train_total_episode:
        for i in range(50):
            train_reward = run_train_episode(env, agent, rpm)
            episode += 1
            logger.info('Episode: {} Reward: {}'.format(episode, train_reward))

        evaluate_reward = run_evaluate_episode(env, agent)
        logger.info('Episode {}, Evaluate reward: {}'.format(
            episode, evaluate_reward))
Example #5
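# TD3 training on a MuJoCo env with PARL: trains until `train_total_steps`, logging
# train/eval episode rewards to the summary writer and evaluating every
# `test_every_steps` steps.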
def main():
    env = gym.make(args.env)
    env.seed(ENV_SEED)

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    model = MujocoModel(act_dim, max_action)
    algorithm = parl.algorithms.TD3(model,
                                    max_action=max_action,
                                    gamma=GAMMA,
                                    tau=TAU,
                                    actor_lr=ACTOR_LR,
                                    critic_lr=CRITIC_LR)
    agent = MujocoAgent(algorithm, obs_dim, act_dim)

    rpm = ReplayMemory(MEMORY_SIZE, obs_dim, act_dim)

    test_flag = 0
    total_steps = 0
    while total_steps < args.train_total_steps:
        train_reward, steps = run_train_episode(env, agent, rpm)
        total_steps += steps
        logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))
        summary.add_scalar('train/episode_reward', train_reward, total_steps)

        if total_steps // args.test_every_steps >= test_flag:
            while total_steps // args.test_every_steps >= test_flag:
                test_flag += 1
            evaluate_reward = run_evaluate_episode(env, agent)
            logger.info('Steps {}, Evaluate reward: {}'.format(
                total_steps, evaluate_reward))
            summary.add_scalar('eval/episode_reward', evaluate_reward,
                               total_steps)
Example #6
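# ADER training on a MuJoCo env: structured like the TD3 example but with extra
# kappa/epoch/alpha hyperparameters; evaluation also reports the fall rate and
# per-episode step counts, which are written to a CSV logger.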
def main():
    env = gym.make(args.env)
    env.seed(args.seed)

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    model = MujocoModel(obs_dim, act_dim, max_action)
    algorithm = ADER(model,
                     max_action=max_action,
                     gamma=GAMMA,
                     tau=TAU,
                     actor_lr=ACTOR_LR,
                     critic_lr=CRITIC_LR,
                     kappa=args.kappa,
                     epoch=args.epoch,
                     alpha=args.alpha)
    agent = MujocoAgent(algorithm, obs_dim, act_dim)

    rpm = ReplayMemory(MEMORY_SIZE, obs_dim, act_dim)

    test_flag = 0
    total_steps = 0
    while total_steps < args.train_total_steps:
        train_reward, steps = run_train_episode(env, agent, rpm)
        total_steps += steps
        logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))

        if total_steps // args.test_every_steps >= test_flag:
            while total_steps // args.test_every_steps >= test_flag:
                test_flag += 1
            evaluate_reward, evaluate_fall_rate, total_steps_list = run_evaluate_episode(
                env, agent)
            mean_steps = np.mean(total_steps_list)
            logger.info('Steps {}, Evaluate reward: {}, Fall rate: {}'.format(
                total_steps, evaluate_reward, evaluate_fall_rate))
            logger.info(
                'Steps {}, Mean episode steps: {}, Steps list: {}'.format(
                    total_steps, mean_steps, total_steps_list))
            res = {
                'eval_step': mean_steps,
                'fall_rate': evaluate_fall_rate,
                'Step': total_steps,
                'Value': evaluate_reward
            }
            csv_logger.log_dict(res)
Example #7
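# ES Learner: connects to a PARL cluster, drives remote actors through signal queues,
# aggregates their mirrored-sampling rollouts, applies the centered-rank ES update,
# and keeps the global observation filter in sync with the actors' local filters.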
class Learner(object):
    def __init__(self, config):
        self.config = config

        env = gym.make(self.config['env_name'])
        self.config['obs_dim'] = env.observation_space.shape[0]
        self.config['act_dim'] = env.action_space.shape[0]

        self.obs_filter = MeanStdFilter(self.config['obs_dim'])
        self.noise = SharedNoiseTable(self.config['noise_size'])

        model = MujocoModel(self.config['act_dim'])
        algorithm = ES(model)
        self.agent = MujocoAgent(algorithm, self.config)

        self.latest_flat_weights = self.agent.get_flat_weights()
        self.latest_obs_filter = self.obs_filter.as_serializable()

        self.sample_total_episodes = 0
        self.sample_total_steps = 0

        self.actors_signal_input_queues = []
        self.actors_output_queues = []

        self.create_actors()

        self.eval_rewards_stat = WindowStat(self.config['report_window_size'])
        self.eval_lengths_stat = WindowStat(self.config['report_window_size'])

    def create_actors(self):
        """ create actors for parallel training.
        """

        parl.connect(self.config['master_address'])
        self.remote_count = 0
        for i in range(self.config['actor_num']):
            signal_queue = queue.Queue()
            output_queue = queue.Queue()
            self.actors_signal_input_queues.append(signal_queue)
            self.actors_output_queues.append(output_queue)

            self.remote_count += 1

            remote_thread = threading.Thread(target=self.run_remote_sample,
                                             args=(signal_queue, output_queue))
            remote_thread.daemon = True
            remote_thread.start()

        logger.info('All remote actors are ready, begin to learn.')

    def run_remote_sample(self, signal_queue, output_queue):
        """ Sample data from remote actor or get filters of remote actor. 
        """
        remote_actor = Actor(self.config)
        while True:
            info = signal_queue.get()
            if info['signal'] == 'sample':
                result = remote_actor.sample(self.latest_flat_weights)
                output_queue.put(result)
            elif info['signal'] == 'get_filter':
                actor_filter = remote_actor.get_filter(flush_after=True)
                output_queue.put(actor_filter)
            elif info['signal'] == 'set_filter':
                remote_actor.set_filter(self.latest_obs_filter)
            else:
                raise NotImplementedError

    def step(self):
        """Run a step in ES.

        1. kick off all actors to synchronize weights and sample data;
        2. update parameters of the model based on sampled data.
        3. update global observation filter based on local filters of all actors, and synchronize global 
           filter to all actors.
        """
        num_episodes, num_timesteps = 0, 0
        results = []

        while num_episodes < self.config['min_episodes_per_batch'] or \
                num_timesteps < self.config['min_steps_per_batch']:
            # Send sample signal to all actors
            for q in self.actors_signal_input_queues:
                q.put({'signal': 'sample'})

            # Collect results from all actors
            for q in self.actors_output_queues:
                result = q.get()
                results.append(result)
                # result['noisy_lengths'] is a list of lists, where the inner lists have length 2.
                num_episodes += sum(
                    len(pair) for pair in result['noisy_lengths'])
                num_timesteps += sum(
                    sum(pair) for pair in result['noisy_lengths'])

        all_noise_indices = []
        all_training_rewards = []
        all_training_lengths = []
        all_eval_rewards = []
        all_eval_lengths = []

        for result in results:
            all_eval_rewards.extend(result['eval_rewards'])
            all_eval_lengths.extend(result['eval_lengths'])

            all_noise_indices.extend(result['noise_indices'])
            all_training_rewards.extend(result['noisy_rewards'])
            all_training_lengths.extend(result['noisy_lengths'])

        assert len(all_eval_rewards) == len(all_eval_lengths)
        assert (len(all_noise_indices) == len(all_training_rewards) ==
                len(all_training_lengths))

        self.sample_total_episodes += num_episodes
        self.sample_total_steps += num_timesteps

        eval_rewards = np.array(all_eval_rewards)
        eval_lengths = np.array(all_eval_lengths)
        noise_indices = np.array(all_noise_indices)
        noisy_rewards = np.array(all_training_rewards)
        noisy_lengths = np.array(all_training_lengths)

        # normalize rewards to (-0.5, 0.5)
        proc_noisy_rewards = utils.compute_centered_ranks(noisy_rewards)
        noises = [
            self.noise.get(index, self.agent.weights_total_size)
            for index in noise_indices
        ]

        # Update the parameters of the model.
        self.agent.learn(proc_noisy_rewards, noises)
        self.latest_flat_weights = self.agent.get_flat_weights()

        # Update obs filter
        self._update_filter()

        # Record the evaluation rewards and lengths
        if len(all_eval_rewards) > 0:
            self.eval_rewards_stat.add(np.mean(eval_rewards))
            self.eval_lengths_stat.add(np.mean(eval_lengths))

        metrics = {
            'episodes_this_iter': noisy_lengths.size,
            'sample_total_episodes': self.sample_total_episodes,
            'sample_total_steps': self.sample_total_steps,
            'evaluate_rewards_mean': self.eval_rewards_stat.mean,
            'evaluate_steps_mean': self.eval_lengths_stat.mean,
            'timesteps_this_iter': noisy_lengths.sum(),
        }

        self.log_metrics(metrics)
        return metrics

    def _update_filter(self):
        # Send get_filter signal to all actors
        for q in self.actors_signal_input_queues:
            q.put({'signal': 'get_filter'})

        # Collect filters from all actors and update the global filter
        for q in self.actors_output_queues:
            actor_filter = q.get()
            self.obs_filter.apply_changes(actor_filter)

        # Send set_filter signal to all actors
        self.latest_obs_filter = self.obs_filter.as_serializable()
        for q in self.actors_signal_input_queues:
            q.put({'signal': 'set_filter'})

    def log_metrics(self, metrics):
        logger.info(metrics)
        for k, v in metrics.items():
            if v is not None:
                summary.add_scalar(k, v, self.sample_total_steps)
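
The Learner above is self-driving once constructed. Below is a minimal, hypothetical driver sketch (not part of the original example): the config keys are the ones read by Learner and Actor in Examples #7 and #8, while the values and the `train_total_steps` stopping threshold are illustrative assumptions.

# Hypothetical driver sketch for the Learner above; values are illustrative only.
if __name__ == '__main__':
    config = {
        'env_name': 'HalfCheetah-v2',        # any continuous-control Gym env
        'noise_size': 200000000,             # entries in the shared noise table
        'master_address': 'localhost:8010',  # address of a running PARL cluster (e.g. `xparl start`)
        'actor_num': 24,
        'min_episodes_per_batch': 1000,
        'min_steps_per_batch': 10000,
        'min_task_runtime': 0.2,
        'eval_prob': 0.003,
        'noise_stdev': 0.02,
        'filter_update_prob': 0.01,
        'action_noise_std': 0.01,
        'report_window_size': 10,
        'train_total_steps': int(1e8),       # assumed stopping criterion, not read by Learner itself
    }

    learner = Learner(config)
    while learner.sample_total_steps < config['train_total_steps']:
        learner.step()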
Example #8
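# ES Actor: runs mirrored-sampling rollouts with +/- perturbations drawn from the
# shared noise table (plus occasional unperturbed evaluation episodes) and exposes
# its observation filter to the Learner.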
class Actor(object):
    def __init__(self, config):
        self.config = config

        self.env = gym.make(self.config['env_name'])
        self.config['obs_dim'] = self.env.observation_space.shape[0]
        self.config['act_dim'] = self.env.action_space.shape[0]

        self.obs_filter = MeanStdFilter(self.config['obs_dim'])
        self.noise = SharedNoiseTable(self.config['noise_size'])

        model = MujocoModel(self.config['act_dim'])
        algorithm = ES(model)
        self.agent = MujocoAgent(algorithm, self.config)

    def _play_one_episode(self, add_noise=False):
        episode_reward = 0
        episode_step = 0

        obs = self.env.reset()
        while True:
            if np.random.uniform() < self.config['filter_update_prob']:
                obs = self.obs_filter(obs[None], update=True)
            else:
                obs = self.obs_filter(obs[None], update=False)

            action = self.agent.predict(obs)
            if add_noise:
                action += np.random.randn(
                    *action.shape) * self.config['action_noise_std']

            obs, reward, done, _ = self.env.step(action)
            episode_reward += reward
            episode_step += 1
            if done:
                break
        return episode_reward, episode_step

    def sample(self, flat_weights):
        noise_indices, rewards, lengths = [], [], []
        eval_rewards, eval_lengths = [], []

        # Perform some rollouts with noise.
        task_tstart = time.time()
        while (len(noise_indices) == 0
               or time.time() - task_tstart < self.config['min_task_runtime']):

            if np.random.uniform() < self.config["eval_prob"]:
                # Do an evaluation run with no perturbation.
                self.agent.set_flat_weights(flat_weights)
                episode_reward, episode_step = self._play_one_episode(
                    add_noise=False)
                eval_rewards.append(episode_reward)
                eval_lengths.append(episode_step)
            else:
                # Do a regular run with parameter perturbations.
                noise_index = self.noise.sample_index(
                    self.agent.weights_total_size)

                perturbation = self.config["noise_stdev"] * self.noise.get(
                    noise_index, self.agent.weights_total_size)

                # mirrored sampling: evaluate pairs of perturbations \epsilon, −\epsilon
                self.agent.set_flat_weights(flat_weights + perturbation)
                episode_reward_pos, episode_step_pos = self._play_one_episode(
                    add_noise=True)

                self.agent.set_flat_weights(flat_weights - perturbation)
                episode_reward_neg, episode_step_neg = self._play_one_episode(
                    add_noise=True)

                noise_indices.append(noise_index)
                rewards.append([episode_reward_pos, episode_reward_neg])
                lengths.append([episode_step_pos, episode_step_neg])

        return {
            'noise_indices': noise_indices,
            'noisy_rewards': rewards,
            'noisy_lengths': lengths,
            'eval_rewards': eval_rewards,
            'eval_lengths': eval_lengths
        }

    def get_filter(self, flush_after=False):
        return_filter = self.obs_filter.as_serializable()
        if flush_after:
            self.obs_filter.clear_buffer()
        return return_filter

    def set_filter(self, new_filter):
        self.obs_filter.sync(new_filter)
Example #9
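# PPO training with a PyTorch model: samples rollouts into RolloutStorage, performs
# clipped-PPO updates with optional linear learning-rate decay, and periodically
# logs training rewards and evaluates the agent.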
def main():
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_env(args.env_name, args.seed, args.gamma)

    model = MujocoModel(envs.observation_space.shape[0],
                        envs.action_space.shape[0])
    model.to(device)

    algorithm = PPO(model,
                    args.clip_param,
                    args.value_loss_coef,
                    args.entropy_coef,
                    initial_lr=args.lr,
                    eps=args.eps,
                    max_grad_norm=args.max_grad_norm)

    agent = MujocoAgent(algorithm, device)

    rollouts = RolloutStorage(args.num_steps, envs.observation_space.shape[0],
                              envs.action_space.shape[0])

    obs = envs.reset()
    rollouts.obs[0] = np.copy(obs)

    episode_rewards = deque(maxlen=10)

    num_updates = int(args.num_env_steps) // args.num_steps
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(algorithm.optimizer, j, num_updates,
                                         args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob = agent.sample(
                    rollouts.obs[step])  # act on the observation stored for this step

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.append(obs, action, action_log_prob, value, reward, masks,
                            bad_masks)

        with torch.no_grad():
            next_value = agent.value(rollouts.obs[-1])

        value_loss, action_loss, dist_entropy = agent.learn(
            next_value, args.gamma, args.gae_lambda, args.ppo_epoch,
            args.num_mini_batch, rollouts)

        rollouts.after_update()

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_steps
            print(
                "Updates {}, num timesteps {},\n Last {} training episodes: "
                "mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, "
                "entropy {:.2f}, value loss {:.2f}, policy loss {:.2f}\n".format(
                    j, total_num_steps, len(episode_rewards),
                    np.mean(episode_rewards), np.median(episode_rewards),
                    np.min(episode_rewards), np.max(episode_rewards),
                    dist_entropy, value_loss, action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            eval_mean_reward = evaluate(agent, ob_rms, args.env_name,
                                        args.seed, device)