Example #1
    def __init__(self, id, state_size, action_size, seed, memory, num_agents,
                 hyperparameters: Mapping[str, float]):
        """Initialize a DDPG agent object.
        
        Params
        ======
            id (int): agent's id
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            memory (ReplayBuffer): replay buffer to store the experience of this agent
            hyperparameters (dict of str: float): hyperparameter values for the model. The expected keys are:
             - batch_size (int): minibatch size
             - lr_actor (float): learning rate of the actor 
             - lr_critic (float): learning rate of the critic 
             - gamma (float): discount factor
             - weight_decay (float): critic L2 weight decay 
             - tau (float): value for soft update of target parameters
             - update_frequency (int): how many steps to run between learning updates
             - n_learns (int): how many learning passes to perform per update 
        """
        self.id = id
        self.__name__ = 'DDPG'
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = hyperparameters['gamma']
        self.batch_size = int(hyperparameters['batch_size'])
        self.tau = hyperparameters['tau']

        self.update_frequency = int(hyperparameters['update_frequency'])
        self.n_learns = int(hyperparameters['n_learns'])

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=hyperparameters['lr_actor'])

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, num_agents,
                                   seed).to(device)
        self.critic_target = Critic(state_size, action_size, num_agents,
                                    seed).to(device)
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=hyperparameters['lr_critic'],
            weight_decay=hyperparameters['weight_decay'])

        # Noise process
        self.noise = Ornstein(action_size)

        # Replay memory
        self.memory = memory

        # Initialize the time step (for every update_frequency steps)
        self.t_step = 0
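
For reference, a minimal usage sketch for this constructor, assuming the surrounding project's Actor, Critic, Ornstein and replay-buffer classes are importable; the key names follow the docstring above, while the numeric values and the shared-buffer construction are illustrative assumptions:

# Hypothetical hyperparameter mapping matching the expected keys above.
hyperparameters = {
    'batch_size': 128,
    'lr_actor': 1e-4,
    'lr_critic': 1e-3,
    'gamma': 0.99,
    'weight_decay': 0.0,
    'tau': 1e-3,
    'update_frequency': 20,
    'n_learns': 10,
}
# memory = ReplayBuffer(...)  # shared buffer, constructed elsewhere
# agent = DDPGAgent(id=0, state_size=24, action_size=2, seed=0,
#                   memory=memory, num_agents=2,
#                   hyperparameters=hyperparameters)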
Example #2
    def __init__(self,
                 env,
                 gamma,
                 tau,
                 buffer_maxlen,
                 critic_learning_rate,
                 actor_learning_rate,
                 max_action=1):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.noise = OUNoise(env.action_space)
        self.iter = 0.0
        self.noisy = False
        self.max_action = max_action

        print(self.action_dim)
        print(self.obs_dim)

        # RL hyperparameters
        self.gamma = gamma
        self.tau = tau

        # Initialize critic and actor networks
        self.critic = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.critic_target = Critic(self.obs_dim,
                                    self.action_dim).to(self.device)

        self.actor = Actor(self.obs_dim, self.action_dim,
                           self.max_action).to(self.device)
        self.actor_target = Actor(self.obs_dim, self.action_dim,
                                  self.max_action).to(self.device)

        # Copy target network parameters for critic
        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data)

        # Set Optimization algorithms
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=critic_learning_rate)
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=actor_learning_rate)

        self.replay_buffer = ExperienceReplayLog(buffer_maxlen)
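
The constructor stores tau but this snippet stops before the training step, so the target-network update itself is not shown; below is a minimal Polyak (soft) update sketch of the form θ_target ← τ·θ + (1 − τ)·θ_target that such a step would typically apply (an assumption about the rest of the class, not code taken from it):

def soft_update(target_net, source_net, tau):
    # Blend source parameters into the target network in place.
    for target_param, param in zip(target_net.parameters(),
                                   source_net.parameters()):
        target_param.data.copy_(tau * param.data +
                                (1.0 - tau) * target_param.data)

# e.g. after each optimization step:
# soft_update(self.critic_target, self.critic, self.tau)
# soft_update(self.actor_target, self.actor, self.tau)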
Example #3
def train(env_id, *args, **kwargs):
    """Train a DDPG model on the given environment."""
    env = gym.make(env_id)

    [obs_dim] = env.observation_space.shape
    [act_dim] = env.action_space.shape

    # tf global variables are created here so they will be initialized by
    # @run_with_sess when we call train_with
    critic = Critic(obs_dim, act_dim)
    actor = Actor(obs_dim, act_dim, critic)

    noise = ornstein_uhlenbeck_noise(np.zeros(act_dim))

    return train_with(env, actor, critic, noise, *args, **kwargs)
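
ornstein_uhlenbeck_noise is not defined in this snippet; for context, a minimal Ornstein-Uhlenbeck sampler of the kind usually paired with DDPG could look like the generator below. The parameter names and default values are assumptions, not the project's implementation:

import numpy as np

def ou_noise_sketch(mu, theta=0.15, sigma=0.2):
    """Yield successive OU-process samples around the 1-D array mu (illustrative only)."""
    x = np.copy(mu)
    while True:
        x = x + theta * (mu - x) + sigma * np.random.randn(*mu.shape)
        yield x

# noise_gen = ou_noise_sketch(np.zeros(act_dim))
# sample = next(noise_gen)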
Example #4
    def __init__(self, logger, obs_dim, action_space, userconfig):
        super().__init__(logger=logger,
                         obs_dim=obs_dim,
                         action_dim=action_space.shape[0],
                         userconfig=userconfig)

        self._observation_dim = obs_dim
        self._action_space = action_space
        self._action_n = action_space.shape[0]
        self._config = {
            "eps": 0.05,
            "discount": 0.95,
            "buffer_size": int(1e5),
            "batch_size": 128,
            "learning_rate_actor": 0.0002,
            "learning_rate_critic": 0.0002,
            "hidden_sizes": [256, 256],
            'tau': 0.0001
        }

        self._config.update(userconfig)
        self._eps = self._config['eps']
        self._tau = self._config['tau']
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')

        self.eval_mode = False

        if self._config['lr_milestones'] is None:
            raise ValueError(
                'lr_milestones argument cannot be None!\nExample: --lr_milestones=100 200 300'
            )

        lr_milestones = [
            int(x) for x in (self._config['lr_milestones'][0]).split(' ')
        ]

        # Critic
        self.critic = Critic(
            self._observation_dim,
            self._action_n,
            hidden_sizes=self._config['hidden_sizes'],
            learning_rate=self._config['learning_rate_critic'],
            lr_milestones=lr_milestones,
            lr_factor=self._config['lr_factor'],
            device=self._config['device'])
        self.critic_target = Critic(
            self._observation_dim,
            self._action_n,
            hidden_sizes=self._config['hidden_sizes'],
            learning_rate=self._config['learning_rate_critic'],
            lr_milestones=lr_milestones,
            lr_factor=self._config['lr_factor'],
            device=self._config['device'])

        # Actor
        self.actor = Actor(self._observation_dim,
                           self._action_n,
                           hidden_sizes=self._config['hidden_sizes'],
                           learning_rate=self._config['learning_rate_actor'],
                           lr_milestones=lr_milestones,
                           lr_factor=self._config['lr_factor'],
                           device=self._config['device'])
        self.actor_target = Actor(
            self._observation_dim,
            self._action_n,
            hidden_sizes=self._config['hidden_sizes'],
            learning_rate=self._config['learning_rate_actor'],
            lr_milestones=lr_milestones,
            lr_factor=self._config['lr_factor'],
            device=self._config['device'])
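
The constructor also reads keys that are not in the default _config dictionary (lr_milestones, lr_factor, device), so they have to arrive via userconfig; given how lr_milestones is parsed above (a one-element list holding a space-separated string), a plausible, purely illustrative userconfig is:

# Illustrative userconfig; the values are assumptions, not project defaults.
userconfig = {
    'lr_milestones': ['100 200 300'],  # parsed into [100, 200, 300]
    'lr_factor': 0.5,                  # LR decay factor applied at each milestone
    'device': 'cpu',
    'learning_rate_actor': 2e-4,       # overrides the built-in default
}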
Example #5
class TD3Agent(Agent):
    """
        The TD3Agent class implements a trainable TD3 agent.

        Parameters
        ----------

        logger: Logger
            Logger used for model management, plotting and printing.
        obs_dim: int
            Dimension of the observation space vector.
        action_space: ndarray
            Action space of the environment.
        userconfig:
            User-supplied config settings that override the defaults.
        """
    def __init__(self, logger, obs_dim, action_space, userconfig):
        super().__init__(logger=logger,
                         obs_dim=obs_dim,
                         action_dim=action_space.shape[0],
                         userconfig=userconfig)

        self._observation_dim = obs_dim
        self._action_space = action_space
        self._action_n = action_space.shape[0]
        self._config = {
            "eps": 0.05,
            "discount": 0.95,
            "buffer_size": int(1e5),
            "batch_size": 128,
            "learning_rate_actor": 0.0002,
            "learning_rate_critic": 0.0002,
            "hidden_sizes": [256, 256],
            'tau': 0.0001,
            'noise': 0.2,
            'noise_clip': 0.5
        }

        self._config.update(userconfig)
        self._eps = self._config['eps']
        self._tau = self._config['tau']
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.eval_mode = False

        if self._config['lr_milestones'] is None:
            raise ValueError(
                'lr_milestones argument cannot be None!\nExample: --lr_milestones=100 200 300'
            )

        lr_milestones = [
            int(x) for x in (self._config['lr_milestones'][0]).split(' ')
        ]

        # Critics
        self.critics = TwinCritic(
            self._observation_dim,
            self._action_n,
            hidden_sizes=self._config['hidden_sizes'],
            learning_rate=self._config['learning_rate_critic'],
            lr_milestones=lr_milestones,
            lr_factor=self._config['lr_factor'],
            device=self._config['device'])

        self.critics_target = TwinCritic(
            self._observation_dim,
            self._action_n,
            hidden_sizes=self._config['hidden_sizes'],
            learning_rate=self._config['learning_rate_critic'],
            lr_milestones=lr_milestones,
            lr_factor=self._config['lr_factor'],
            device=self._config['device'])

        # Actor
        self.actor = Actor(self._observation_dim,
                           self._action_n,
                           hidden_sizes=self._config['hidden_sizes'],
                           learning_rate=self._config['learning_rate_actor'],
                           lr_milestones=lr_milestones,
                           lr_factor=self._config['lr_factor'],
                           device=self._config['device'])
        self.actor_target = Actor(
            self._observation_dim,
            self._action_n,
            hidden_sizes=self._config['hidden_sizes'],
            learning_rate=self._config['learning_rate_actor'],
            lr_milestones=lr_milestones,
            lr_factor=self._config['lr_factor'],
            device=self._config['device'])

    def eval(self):
        self.eval_mode = True

    def train_mode(self):
        self.eval_mode = False

    def act(self, observation, noise=0, evaluation=False):
        state = torch.from_numpy(observation).float().to(self.device)
        action = self.actor.forward(state)
        action = action.detach().cpu().numpy()[0]

        if noise != 0 and not evaluation:
            action = (action +
                      np.random.normal(0, noise, size=action.shape[0]))
        return action.clip(-1, 1)

    def schedulers_step(self):
        self.critics.lr_scheduler.step()
        self.critics_target.lr_scheduler.step()
        self.actor.lr_scheduler.step()
        self.actor_target.lr_scheduler.step()

    def store_transition(self, transition):
        self.buffer.add_transition(transition)

    @staticmethod
    def load_model(fpath):
        with open(Path(fpath), 'rb') as inp:
            return pickle.load(inp)

    def train(self, total_step_counter, iter_fit=32):
        losses = []

        for i in range(iter_fit):
            data = self.buffer.sample(batch_size=self._config['batch_size'])
            s = torch.FloatTensor(np.stack(data[:, 0])).to(self.device)

            s_next = torch.FloatTensor(np.stack(data[:, 3])).to(self.device)
            a = torch.FloatTensor(np.stack(
                data[:, 1])[:, None]).squeeze(dim=1).to(self.device)

            rew = torch.FloatTensor(np.stack(
                data[:, 2])[:, None]).squeeze(dim=1).to(self.device)

            done = torch.FloatTensor(np.stack(
                data[:, 4])[:,
                            None]).squeeze(dim=1).to(self.device)  # done flag

            noise = torch.FloatTensor(a.cpu()).data.normal_(
                0, self._config['noise']).to(self.device)
            noise = noise.clamp(-self._config['noise_clip'],
                                self._config['noise_clip'])
            a_next = (self.actor_target(s_next).to(self.device) + noise).clamp(
                -1, 1)

            Q1_target, Q2_target = self.critics_target(s_next, a_next)
            target_Q = torch.min(Q1_target,
                                 Q2_target).squeeze(dim=1).to(self.device)

            # target

            targets = rew + self._config['discount'] * target_Q * (1.0 - done)

            # optimize critic
            targets = targets.to(self.device)
            Q1_current, Q2_current = self.critics(s, a)
            Q1_current = Q1_current.squeeze(dim=1).to(self.device)
            Q2_current = Q2_current.squeeze(dim=1).to(self.device)
            critic_loss = F.mse_loss(Q1_current, targets) + F.mse_loss(
                Q2_current, targets)

            losses.append(critic_loss)
            self.critics.optimizer.zero_grad()
            critic_loss.backward()
            self.critics.optimizer.step()

            if ((total_step_counter - 1) * iter_fit + i +
                    1) % self._config['update_target_every'] == 0:
                # optimize actor
                actions = self.actor.forward(s)
                actor_loss = -self.critics.Q1(s, actions).mean()
                self.actor.optimizer.zero_grad()
                actor_loss.backward()
                self.actor.optimizer.step()
                # update

                soft_update(self.critics_target, self.critics, self._tau)
                soft_update(self.actor_target, self.actor, self._tau)

        return losses
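
The delayed update inside train counts gradient steps globally across calls, via (total_step_counter - 1) * iter_fit + i + 1; the short, runnable arithmetic check below mirrors that condition with an illustrative update_target_every of 4 (the real value comes from userconfig and is not shown in this snippet):

# Worked example of the delayed-update schedule used above (values illustrative).
iter_fit = 32
update_target_every = 4
for total_step_counter in (1, 2):
    for i in range(iter_fit):
        global_step = (total_step_counter - 1) * iter_fit + i + 1
        if global_step % update_target_every == 0:
            print('actor and target networks updated at gradient step', global_step)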
Example #6
def retraining(
        save_path,
        network,
        env,
        seed=None,
        total_timesteps=None,
        nb_epochs=None,  # with default settings, perform 1M steps total
        nb_epoch_cycles=4,  #50
        nb_rollout_steps=3,  #100
        reward_scale=1.0,
        render=False,
        render_eval=False,
        #   noise_type='adaptive-param_0.2',
        noise_type='normal_0.2',
        #   noise_type='ou_0.9',
        normalize_returns=False,
        normalize_observations=True,
        critic_l2_reg=1e-2,
        actor_lr=1e-4,
        critic_lr=1e-4,
        #   actor_lr=1e-6,
        #   critic_lr=1e-5,
        popart=False,
        gamma=0.99,
        clip_norm=None,
        nb_train_steps=3,  # per epoch cycle and MPI worker,  50
        nb_eval_steps=1,  #100
        batch_size=640,  # per MPI worker
        tau=0.01,
        eval_env=None,
        param_noise_adaption_interval=3,  #50
        **network_kwargs):

    if total_timesteps is not None:
        assert nb_epochs is None
        nb_epochs = int(total_timesteps) // (nb_epoch_cycles *
                                             nb_rollout_steps)
    else:
        nb_epochs = 500

    rank = MPI.COMM_WORLD.Get_rank()
    # nb_actions = env.action_space.shape[-1]
    nb_actions = env.num_actions

    # nb_actions=3
    # print(nb_actions)
    action_shape = np.array(nb_actions * [0]).shape

    #4 pairs pos + 3 link length
    # nb_features = 2*(env.num_actions+1)+env.num_actions

    #4 pairs pos + 1 pair target pos
    nb_features = 2 * (env.num_actions + 2)
    observation_shape = np.array(nb_features * [0]).shape
    # assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.

    # memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape)
    memory = Memory(limit=int(1e6),
                    action_shape=action_shape,
                    observation_shape=observation_shape)
    critic = Critic(network=network, **network_kwargs)
    actor = Actor(nb_actions, network=network, **network_kwargs)

    action_noise = None
    param_noise = None
    # nb_actions = env.action_space.shape[-1]
    if noise_type is not None:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                                 sigma=float(stddev) *
                                                 np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))

    # agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
    agent = DDPG(actor,
                 critic,
                 memory,
                 observation_shape,
                 action_shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    sess = U.get_session()
    # Prepare everything.
    agent.initialize(sess)
    # sess.graph.finalize()

    agent.reset()

    obs = env.reset()
    if eval_env is not None:
        eval_obs = eval_env.reset()
    nenvs = obs.shape[0]

    episode_reward = np.zeros(nenvs, dtype=np.float32)  #vector
    episode_step = np.zeros(nenvs, dtype=int)  # vector
    episodes = 0  #scalar
    t = 0  # scalar
    step_set = []
    reward_set = []

    epoch = 0

    start_time = time.time()

    epoch_episode_rewards = []
    mean_epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_actions = []
    epoch_qs = []
    epoch_episodes = 0
    #load the initialization policy
    agent.load_ini(sess, save_path)
    # agent.memory.clear(limit=int(1e6), action_shape=action_shape, observation_shape=observation_shape)
    for epoch in range(nb_epochs):
        print(nb_epochs)
        # obs, env_state = env.reset()
        obs = env.reset()
        agent.save(save_path)
        epoch_episode_rewards = []
        '''check if the actor initialization policy has been loaded correctly,
        i.e. equal to the values output directly in the checkpoint files'''
        # loaded_weights=tf.get_default_graph().get_tensor_by_name('target_actor/mlp_fc0/w:0')
        # print('loaded_weights:', sess.run(loaded_weights))
        for cycle in range(nb_epoch_cycles):
            # Perform rollouts.

            for t_rollout in range(nb_rollout_steps):
                # Predict next action
                action, q, _, _ = agent.step(obs,
                                             apply_noise=True,
                                             compute_Q=True)
                print('action:', action)

                new_obs, r, done = env.step(action)
                # time.sleep(0.2)
                t += 1

                episode_reward += r
                episode_step += 1
                # print('episode_re: ', episode_reward) #[1.]

                # Book-keeping.
                epoch_actions.append(action)
                epoch_qs.append(q)
                b = 1.
                agent.store_transition(
                    obs, action, r, new_obs, done
                )  #the batched data will be unrolled in memory.py's append.

                obs = new_obs

            epoch_episode_rewards.append(episode_reward)
            episode_reward = np.zeros(nenvs, dtype=np.float32)  #vector

            # Train.
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []
            for t_train in range(nb_train_steps):
                # Adapt param noise, if necessary.
                if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                    distance = agent.adapt_param_noise()
                    epoch_adaptive_distances.append(distance)
                # print('Train!')
                cl, al = agent.train()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                agent.update_target_net()

            # Evaluate.
            eval_episode_rewards = []
            eval_qs = []
            if eval_env is not None:
                nenvs_eval = eval_obs.shape[0]
                eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32)
                for t_rollout in range(nb_eval_steps):
                    eval_action, eval_q, _, _ = agent.step(eval_obs,
                                                           apply_noise=False,
                                                           compute_Q=True)
                    # eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                        eval_action)
                    if render_eval:
                        eval_env.render()
                    eval_episode_reward += eval_r

                    eval_qs.append(eval_q)
                    for d in range(len(eval_done)):
                        if eval_done[d]:
                            eval_episode_rewards.append(eval_episode_reward[d])
                            eval_episode_rewards_history.append(
                                eval_episode_reward[d])
                            eval_episode_reward[d] = 0.0

        mpi_size = MPI.COMM_WORLD.Get_size()
        # Log stats.
        # XXX shouldn't call np.mean on variable length lists
        duration = time.time() - start_time
        stats = agent.get_stats()
        combined_stats = stats.copy()
        combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
        combined_stats['rollout/return_history'] = np.mean(
            episode_rewards_history)
        combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
        combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
        combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
        combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
        combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
        combined_stats['train/param_noise_distance'] = np.mean(
            epoch_adaptive_distances)
        combined_stats['total/duration'] = duration
        combined_stats['total/steps_per_second'] = float(t) / float(duration)
        combined_stats['total/episodes'] = episodes
        combined_stats['rollout/episodes'] = epoch_episodes
        combined_stats['rollout/actions_std'] = np.std(epoch_actions)

        mean_epoch_episode_rewards.append(np.mean(epoch_episode_rewards))
        # print(step_set,mean_epoch_episode_rewards)
        step_set.append(t)
        plt.plot(step_set,
                 mean_epoch_episode_rewards,
                 color='r',
                 label='Initialization')
        plt.xlabel('Steps')
        plt.ylabel('Mean Episode Reward')
        plt.savefig('ddpg_mean_retrain.png')
        # plt.show()

        # Evaluation statistics.
        if eval_env is not None:
            combined_stats['eval/return'] = eval_episode_rewards
            combined_stats['eval/return_history'] = np.mean(
                eval_episode_rewards_history)
            combined_stats['eval/Q'] = eval_qs
            combined_stats['eval/episodes'] = len(eval_episode_rewards)

        def as_scalar(x):
            if isinstance(x, np.ndarray):
                assert x.size == 1
                return x[0]
            elif np.isscalar(x):
                return x
            else:
                raise ValueError('expected scalar, got %s' % x)

        combined_stats_sums = MPI.COMM_WORLD.allreduce(
            np.array(
                [np.array(x).flatten()[0] for x in combined_stats.values()]))
        combined_stats = {
            k: v / mpi_size
            for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
        }

        # Total statistics.
        combined_stats['total/epochs'] = epoch + 1
        combined_stats['total/steps'] = t

        for key in sorted(combined_stats.keys()):
            logger.record_tabular(key, combined_stats[key])

        if rank == 0:
            logger.dump_tabular()
        logger.info('')
        logdir = logger.get_dir()
        if rank == 0 and logdir:
            if hasattr(env, 'get_state'):
                with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                    pickle.dump(env.get_state(), f)
            if eval_env and hasattr(eval_env, 'get_state'):
                with open(os.path.join(logdir, 'eval_env_state.pkl'),
                          'wb') as f:
                    pickle.dump(eval_env.get_state(), f)
    print('stepset: ', step_set)
    print('rewards: ', mean_epoch_episode_rewards)

    return agent
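
For reference, the noise_type strings accepted by the parser in retraining are comma-separated entries of the form <kind>_<stddev>; a few valid examples (the standard deviations are illustrative):

# Valid noise_type strings for the parser above (stddev values illustrative).
noise_type_examples = [
    'none',
    'normal_0.2',                     # Gaussian action noise
    'ou_0.9',                         # Ornstein-Uhlenbeck action noise
    'adaptive-param_0.2',             # adaptive parameter-space noise
    'normal_0.2,adaptive-param_0.2',  # entries may be combined
]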
Example #7
class DDPGAgent(object):
    """Interacts with and learns from the environment."""
    def __init__(self, id, state_size, action_size, seed, memory, num_agents,
                 hyperparameters: Mapping[str, float]):
        """Initialize a DDPG agent object.
        
        Params
        ======
            id (int): agent's id
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            memory (ReplayBuffer): replay buffer to store the experience of this agent
            hyperparameters (dict of str: float): hyperparameter values for the model. The expected keys are:
             - batch_size (int): minibatch size
             - lr_actor (float): learning rate of the actor 
             - lr_critic (float): learning rate of the critic 
             - gamma (float): discount factor
             - weight_decay (float): critic L2 weight decay 
             - tau (float): value for soft update of target parameters
             - update_frequency (int): how many steps to run between learning updates
             - n_learns (int): how many learning passes to perform per update 
        """
        self.id = id
        self.__name__ = 'DDPG'
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = hyperparameters['gamma']
        self.batch_size = int(hyperparameters['batch_size'])
        self.tau = hyperparameters['tau']

        self.update_frequency = int(hyperparameters['update_frequency'])
        self.n_learns = int(hyperparameters['n_learns'])

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=hyperparameters['lr_actor'])

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, num_agents,
                                   seed).to(device)
        self.critic_target = Critic(state_size, action_size, num_agents,
                                    seed).to(device)
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=hyperparameters['lr_critic'],
            weight_decay=hyperparameters['weight_decay'])

        # Noise process
        self.noise = Ornstein(action_size)

        # Replay memory
        self.memory = memory

        # Initialize the time step (for every update_frequency steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done, other_states,
             other_actions, other_next_states):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        self.memory.add(state, action, reward, next_state, done, other_states,
                        other_actions, other_next_states)

        self.t_step = (self.t_step + 1) % self.update_frequency
        if self.t_step == 0:
            # Learn, if enough samples are available in memory
            for _ in range(self.n_learns):
                if len(self.memory) > self.batch_size:
                    experiences = self.memory.sample(self.batch_size)
                    self.learn(experiences, self.gamma)

    def act(self, states, add_noise=True):
        """Returns actions for given state as per current policy."""
        states = torch.from_numpy(states).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, _, _, _, _, other_states, _, _ = experiences

        self.update_critic(experiences, gamma)
        self.update_actor(states, other_states)
        self.update_target_networks()

    def update_critic(self, experiences, gamma):
        """Update the critic network given the experiences"""

        states, actions, rewards, next_states, dones, other_states, other_actions, other_next_states = experiences

        all_states = torch.cat((states, other_states), dim=1).to(device)
        all_actions = torch.cat((actions, other_actions), dim=1).to(device)
        all_next_states = torch.cat((next_states, other_next_states),
                                    dim=1).to(device)

        local_all_next_actions = []
        local_all_next_actions.append(self.actor_target(states))
        local_all_next_actions.append(self.actor_target(other_states))
        all_next_actions = torch.cat(local_all_next_actions, dim=1).to(device)

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        Q_targets_next = self.critic_target(all_next_states, all_next_actions)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(all_states, all_actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

    def update_actor(self, states, other_states):

        all_states = torch.cat((states, other_states), dim=1).to(device)

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        other_actions_pred = self.actor_local(other_states)
        other_actions_pred = other_actions_pred.detach()

        actions_pred = torch.cat((actions_pred, other_actions_pred),
                                 dim=1).to(device)
        actor_loss = -self.critic_local(all_states, actions_pred).mean()

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

    def update_target_networks(self):

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
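
To show how the other_* arguments of step fit together in a two-agent setting, here is a rough interaction-loop sketch; the environment interface (reset returning stacked states, step returning stacked rewards and done flags) is an assumption, and only the agent methods defined above are used:

import numpy as np

def two_agent_episode_sketch(env, agents):
    """Illustrative only: drive two DDPGAgent instances for one episode."""
    states = env.reset()                      # assumed shape: (2, state_size)
    dones = np.array([False, False])
    while not np.any(dones):
        actions = np.vstack([agents[i].act(states[i:i + 1]) for i in range(2)])
        next_states, rewards, dones = env.step(actions)
        for i, agent in enumerate(agents):
            j = 1 - i                         # index of the other agent
            agent.step(states[i], actions[i], rewards[i], next_states[i],
                       dones[i], states[j], actions[j], next_states[j])
        states = next_states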
Example #8
def testing(
        save_path,
        network,
        env,
        seed=None,
        total_timesteps=None,
        nb_epochs=None,  # with default settings, perform 1M steps total
        nb_epoch_cycles=50,
        nb_rollout_steps=3,
        reward_scale=1.0,
        render=False,
        render_eval=False,
        # no noise for test
        #   noise_type='adaptive-param_0.2',
        #   noise_type='normal_0.9',
        #   noise_type='ou_0.9',
        normalize_returns=False,
        normalize_observations=True,
        critic_l2_reg=1e-2,
        actor_lr=1e-4,
        critic_lr=1e-3,
        #   actor_lr=1e-6,
        #   critic_lr=1e-5,
        popart=False,
        gamma=0.99,
        clip_norm=None,
        nb_train_steps=3,  # per epoch cycle and MPI worker,  50
        nb_eval_steps=1,
        batch_size=64,  # per MPI worker
        tau=0.01,
        eval_env=None,
        param_noise_adaption_interval=3,  #
        **network_kwargs):

    if total_timesteps is not None:
        assert nb_epochs is None
        nb_epochs = int(total_timesteps) // (nb_epoch_cycles *
                                             nb_rollout_steps)
    else:
        nb_epochs = 500

    rank = MPI.COMM_WORLD.Get_rank()
    # nb_actions = env.action_space.shape[-1]
    # nb_actions = 2*env.grid_size
    nb_actions = env.grid_size
    action_shape = np.array(nb_actions * [0]).shape
    nb_features = (4 + 1) * env.grid_size
    observation_shape = np.array(nb_features * [0]).shape
    grid_x = env.grid_x
    grid_y = env.grid_y
    x = []
    y = []
    for i in range(grid_x):
        x.append(i + 1)
    for i in range(grid_y):
        y.append(i + 1)
    # assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.

    # memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape)
    memory = Memory(limit=int(1e6),
                    action_shape=action_shape,
                    observation_shape=observation_shape)
    critic = Critic(network=network, **network_kwargs)
    actor = Actor(nb_actions, network=network, **network_kwargs)

    action_noise = None
    param_noise = None
    '''no noise for test'''
    # if noise_type is not None:
    #     for current_noise_type in noise_type.split(','):
    #         current_noise_type = current_noise_type.strip()
    #         if current_noise_type == 'none':
    #             pass
    #         elif 'adaptive-param' in current_noise_type:
    #             _, stddev = current_noise_type.split('_')
    #             param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev))
    #         elif 'normal' in current_noise_type:
    #             _, stddev = current_noise_type.split('_')
    #             action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
    #         elif 'ou' in current_noise_type:
    #             _, stddev = current_noise_type.split('_')
    #             action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
    #         else:
    #             raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # max_action = env.action_space.high
    # logger.info('scaling actions by {} before executing in env'.format(max_action))

    # agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
    agent = DDPG(actor,
                 critic,
                 memory,
                 observation_shape,
                 action_shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    sess = U.get_session()
    # Prepare everything.
    # agent.initialize(sess)
    # sess.graph.finalize()
    agent.load(sess, save_path)

    agent.reset()

    obs, env_state = env.reset()
    if eval_env is not None:
        eval_obs = eval_env.reset()
    nenvs = obs.shape[0]

    episode_reward = np.zeros(nenvs, dtype=np.float32)  #vector
    episode_step = np.zeros(nenvs, dtype=int)  # vector
    episodes = 0  #scalar
    t = 0  # scalar
    step_set = []
    reward_set = []

    epoch = 0

    start_time = time.time()

    epoch_episode_rewards = []
    average_reward = []
    mean_epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_actions = []
    epoch_qs = []
    epoch_state = []
    epoch_episodes = 0
    #record the car numbers in each step
    car_num_set = {}
    t_set = [i for i in range(total_timesteps)]
    for xx in x:
        for yy in y:
            lab = str(xx) + str(yy)
            car_num_set[lab] = [[0 for i in range(total_timesteps)]
                                for j in range(4)]

    for epoch in range(nb_epochs):
        obs, env_state = env.reset()
        epoch_actions = []
        epoch_state = []
        average_car_num_set = []
        last_action = 1
        for cycle in range(nb_epoch_cycles):
            # Perform rollouts.
            action, q, _, _ = agent.step(obs,
                                         apply_noise=False,
                                         compute_Q=True)
            '''random action'''
            # if np.random.rand()>0.5:
            #     action=[1]
            # else:
            #     action=[0]
            '''cycle light state'''
            # action=[0]
            '''cycle action (should cycle state instead of action)'''
            # if last_action==1:
            #     action=[0]
            # else:
            #     action=[1]
            # last_action=action[0]

            if nenvs > 1:
                # if simulating multiple envs in parallel, impossible to reset agent at the end of the episode in each
                # of the environments, so resetting here instead
                agent.reset()
            for t_rollout in range(nb_rollout_steps):
                new_obs, r, env_state, done = env.step(action, env_state)
                epoch_state.append(env_state['11'].light_state)
                for xx in x:
                    for yy in y:
                        lab = str(xx) + str(yy)
                        for i in range(4):
                            car_num_set[lab][i][t] = (
                                env_state['11'].car_nums[i])
                t += 1
                episode_reward += r
                episode_step += 1

                # Book-keeping.
                epoch_actions.append(action)
                epoch_qs.append(q)
                b = 1.
                agent.store_transition(
                    obs, action, r, new_obs, done
                )  #the batched data will be unrolled in memory.py's append.
                obs = new_obs

                for d in range(len(done)):
                    if done[d]:
                        print('done')
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward[d])
                        episode_rewards_history.append(episode_reward[d])
                        epoch_episode_steps.append(episode_step[d])
                        episode_reward[d] = 0.
                        episode_step[d] = 0
                        epoch_episodes += 1
                        episodes += 1
                        if nenvs == 1:
                            agent.reset()

            epoch_episode_rewards.append(episode_reward)
            average_reward.append(episode_reward / nb_rollout_steps)

            episode_reward = np.zeros(nenvs, dtype=np.float32)  #vector

            # Train.
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []
            # for t_train in range(nb_train_steps):
            #     # Adapt param noise, if necessary.
            #     if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
            #         distance = agent.adapt_param_noise()
            #         epoch_adaptive_distances.append(distance)
            #     # print('Train!')
            #     cl, al = agent.train()
            #     epoch_critic_losses.append(cl)
            #     epoch_actor_losses.append(al)
            #     agent.update_target_net()

            # Evaluate.
            eval_episode_rewards = []
            eval_qs = []
            if eval_env is not None:
                nenvs_eval = eval_obs.shape[0]
                eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32)
                for t_rollout in range(nb_eval_steps):
                    eval_action, eval_q, _, _ = agent.step(eval_obs,
                                                           apply_noise=False,
                                                           compute_Q=True)
                    # eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                        eval_action)
                    if render_eval:
                        eval_env.render()
                    eval_episode_reward += eval_r

                    eval_qs.append(eval_q)
                    for d in range(len(eval_done)):
                        if eval_done[d]:
                            eval_episode_rewards.append(eval_episode_reward[d])
                            eval_episode_rewards_history.append(
                                eval_episode_reward[d])
                            eval_episode_reward[d] = 0.0
            step_set.append(t)

        mpi_size = MPI.COMM_WORLD.Get_size()
        # Log stats.
        # XXX shouldn't call np.mean on variable length lists
        duration = time.time() - start_time
        stats = agent.get_stats()
        combined_stats = stats.copy()
        combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
        combined_stats['rollout/return_history'] = np.mean(
            episode_rewards_history)
        combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
        combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
        combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
        combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
        combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
        combined_stats['train/param_noise_distance'] = np.mean(
            epoch_adaptive_distances)
        combined_stats['total/duration'] = duration
        combined_stats['total/steps_per_second'] = float(t) / float(duration)
        combined_stats['total/episodes'] = episodes
        combined_stats['rollout/episodes'] = epoch_episodes
        combined_stats['rollout/actions_std'] = np.std(epoch_actions)

        mean_epoch_episode_rewards.append(np.mean(epoch_episode_rewards))
        # print(step_set,mean_epoch_episode_rewards)
        # plt.figure(figsize=(8,5))
        '''plot rewards-steps'''
        ax1 = plt.subplot(2, 1, 1)
        plt.sca(ax1)
        plt.plot(step_set, average_reward, color='b')
        # plt.xlabel('Steps')
        plt.ylabel('Mean Reward', fontsize=12)
        # plt.ylim(-15000,0)
        '''plot queueing car numbers-steps'''
        ax2 = plt.subplot(2, 1, 2)
        plt.sca(ax2)
        print(np.shape(t_set), np.shape(car_num_set['11'][i]))
        for i in range(4):
            if i == 0:
                plt.plot(t_set, car_num_set['11'][i], '--', label=i, color='b')
            elif i == 1:
                plt.plot(t_set,
                         car_num_set['11'][i],
                         '--',
                         label=i,
                         color='orange')
            elif i == 2:
                plt.plot(t_set, car_num_set['11'][i], label=i, color='g')
            else:
                plt.plot(t_set, car_num_set['11'][i], label=i, color='r')
        plt.ylim(0, 100)
        #sum among roads
        sum_car_num = np.sum(car_num_set['11'], axis=0)
        #average among time steps
        average_car_num = np.average(sum_car_num)
        average_car_num_set.append(average_car_num)

        plt.xlabel('Steps', fontsize=12)
        plt.ylabel('Cars Numbers', fontsize=12)
        # set legend
        handles, labels = plt.gca().get_legend_handles_labels()
        by_label = OrderedDict(zip(labels, handles))
        leg = plt.legend(by_label.values(), by_label.keys(), loc=1)
        # leg = plt.legend(loc=4)
        legfm = leg.get_frame()
        legfm.set_edgecolor('black')  # set legend fame color
        legfm.set_linewidth(0.5)  # set legend fame linewidth
        plt.savefig('ddpg_mean_test.pdf')
        plt.show()
        print(epoch_state)

        # Evaluation statistics.
        if eval_env is not None:
            combined_stats['eval/return'] = eval_episode_rewards
            combined_stats['eval/return_history'] = np.mean(
                eval_episode_rewards_history)
            combined_stats['eval/Q'] = eval_qs
            combined_stats['eval/episodes'] = len(eval_episode_rewards)

        def as_scalar(x):
            if isinstance(x, np.ndarray):
                assert x.size == 1
                return x[0]
            elif np.isscalar(x):
                return x
            else:
                raise ValueError('expected scalar, got %s' % x)

        combined_stats_sums = MPI.COMM_WORLD.allreduce(
            np.array(
                [np.array(x).flatten()[0] for x in combined_stats.values()]))
        combined_stats = {
            k: v / mpi_size
            for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
        }

        # Total statistics.
        combined_stats['total/epochs'] = epoch + 1
        combined_stats['total/steps'] = t

        for key in sorted(combined_stats.keys()):
            logger.record_tabular(key, combined_stats[key])

        if rank == 0:
            logger.dump_tabular()
        logger.info('')
        logdir = logger.get_dir()
        if rank == 0 and logdir:
            if hasattr(env, 'get_state'):
                with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                    pickle.dump(env.get_state(), f)
            if eval_env and hasattr(eval_env, 'get_state'):
                with open(os.path.join(logdir, 'eval_env_state.pkl'),
                          'wb') as f:
                    pickle.dump(eval_env.get_state(), f)
    print('average queueing car numbers: ', np.average(average_car_num_set))

    return agent
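
Both retraining and testing average their per-worker statistics the same way: each rank contributes the first element of every stat, the vectors are summed with MPI.COMM_WORLD.allreduce, and the sums are divided by the number of ranks. The pattern in isolation, assuming MPI is mpi4py's MPI module as in the snippets above:

import numpy as np
from mpi4py import MPI

def average_stats_across_ranks(stats):
    """Illustrative reduction of a {name: scalar} dict across MPI workers."""
    comm = MPI.COMM_WORLD
    sums = comm.allreduce(
        np.array([np.array(v).flatten()[0] for v in stats.values()]))
    return {k: v / comm.Get_size() for k, v in zip(stats.keys(), sums)}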
Example #9
def testing(save_path, network, env,
          seed=None,
          total_timesteps=None,
          nb_epochs=None, # with default settings, perform 1M steps total
          nb_epoch_cycles=50,
          nb_rollout_steps=3,  #100
          reward_scale=1.0,
          render=False,
          render_eval=False,
          # no noise for test
        #   noise_type='adaptive-param_0.2',
        #   noise_type='normal_0.9',
        #   noise_type='ou_0.9',

          normalize_returns=False,
          normalize_observations=True,
          critic_l2_reg=1e-2,
          actor_lr=1e-4,
          critic_lr=1e-3,
        #   actor_lr=1e-6,
        #   critic_lr=1e-5,
          popart=False,
          gamma=0.99,
          clip_norm=None,
          nb_train_steps=3, # per epoch cycle and MPI worker,  50
          nb_eval_steps=1,  #100
          batch_size=640, # per MPI worker
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=3, #50
          **network_kwargs):


    if total_timesteps is not None:
        assert nb_epochs is None
        nb_epochs = int(total_timesteps) // (nb_epoch_cycles * nb_rollout_steps)
    else:
        nb_epochs = 500

    rank = MPI.COMM_WORLD.Get_rank()
    # nb_actions = env.action_space.shape[-1]
    nb_actions = env.num_actions

    # nb_actions=3
    # print(nb_actions)
    action_shape=np.array(nb_actions*[0]).shape

    nb_features = 2*(env.num_actions+1)+env.num_actions
    observation_shape=np.array(nb_features*[0]).shape
    # assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.

    # memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape)
    memory = Memory(limit=int(1e6), action_shape=action_shape, observation_shape=observation_shape)
    critic = Critic(network=network, **network_kwargs)
    actor = Actor(nb_actions, network=network, **network_kwargs)

    action_noise = None
    param_noise = None
    # nb_actions = env.action_space.shape[-1]
    '''no noise for test'''
    # if noise_type is not None:
    #     for current_noise_type in noise_type.split(','):
    #         current_noise_type = current_noise_type.strip()
    #         if current_noise_type == 'none':
    #             pass
    #         elif 'adaptive-param' in current_noise_type:
    #             _, stddev = current_noise_type.split('_')
    #             param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev))
    #         elif 'normal' in current_noise_type:
    #             _, stddev = current_noise_type.split('_')
    #             action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
    #         elif 'ou' in current_noise_type:
    #             _, stddev = current_noise_type.split('_')
    #             action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
    #         else:
    #             raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # max_action = env.action_space.high
    # logger.info('scaling actions by {} before executing in env'.format(max_action))

    # agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
    agent = DDPG(actor, critic, memory, observation_shape, action_shape,
        gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations,
        batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg,
        actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm,
        reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    sess = U.get_session()
    # Prepare everything.
    agent.load(sess,save_path)
    # sess.graph.finalize()  # cannot save sess if it's finalized!

    agent.reset()

    obs = env.reset()
    if eval_env is not None:
        eval_obs = eval_env.reset()
    nenvs = obs.shape[0]

    episode_reward = np.zeros(nenvs, dtype = np.float32) #vector
    episode_step = np.zeros(nenvs, dtype = int) # vector
    episodes = 0 #scalar
    t = 0 # scalar
    step_set=[]
    reward_set=[]

    epoch = 0



    start_time = time.time()

    epoch_episode_rewards = []
    mean_epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_actions = []
    epoch_qs = []
    epoch_episodes = 0
    for epoch in range(nb_epochs):
        print(nb_epochs)
        # obs, env_state = env.reset()
        obs = env.reset()
        for cycle in range(nb_epoch_cycles):
            # Perform rollouts.
            if nenvs > 1:
                # if simulating multiple envs in parallel, impossible to reset agent at the end of the episode in each
                # of the environments, so resetting here instead
                agent.reset()
            for t_rollout in range(nb_rollout_steps):
                # Predict next action.
                '''no noise for test'''
                action, q, _, _ = agent.step(obs, apply_noise=False, compute_Q=True)
                # print('action:', action)

                # Execute next action.
                # if rank == 0 and render:
                #     env.render()

                # max_action is of dimension A, whereas action is dimension (nenvs, A) - the multiplication gets broadcasted to the batch
                # new_obs, r, done, info = env.step(max_action * action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                
                # new_obs, r, env_state,done = env.step(action, env_state)
                '''actually no need for env_state: in or out'''
                new_obs, r, done = env.step(action)


                # print('reward:', r)
                # note these outputs are batched from vecenv
                # print('obs: ',obs.shape,obs, 'action: ', action.shape, action )
                '''obs shape: (1,17), action shape: (1,6)'''
                # print('maxaction: ', max_action.shape)
                '''max_action shape: (6,) , max_action*action shape: (1,6)'''
                t += 1
                # if rank == 0 and render:
                #     env.render()
                # print('r:', r)
                episode_reward += r
                episode_step += 1
                # print('episode_re: ', episode_reward) #[1.]

                # Book-keeping.
                epoch_actions.append(action)
                epoch_qs.append(q)
                b=1.
                agent.store_transition(obs, action, r, new_obs, done) #the batched data will be unrolled in memory.py's append.
                # print('r: ', r)
                # '''r shape: (1,)'''
                obs = new_obs

                # for d in range(len(done)):
                #     if done[d]:
                #         print('done')
                #         # Episode done.
                #         epoch_episode_rewards.append(episode_reward[d])
                #         episode_rewards_history.append(episode_reward[d])
                #         epoch_episode_steps.append(episode_step[d])
                #         episode_reward[d] = 0.
                #         episode_step[d] = 0
                #         epoch_episodes += 1
                #         episodes += 1
                #         if nenvs == 1:
                #             agent.reset()

            '''added: record the per-env reward accumulated over this cycle'''
            epoch_episode_rewards.append(episode_reward)
            '''
            step_set.append(t)
            reward_set=np.concatenate((reward_set,episode_reward))
            # print(step_set,reward_set)
            # print(t, episode_reward)
            
            plt.plot(step_set,reward_set)
            plt.xlabel('Steps')
            plt.ylabel('Episode Reward')
            plt.savefig('ddpg.png')
            plt.show()
            '''

            episode_reward = np.zeros(nenvs, dtype=np.float32)  # vector

            # Train.
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []
            '''no training for test'''
            # for t_train in range(nb_train_steps):
                # Adapt param noise, if necessary. no noise for test!
                # if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                #     distance = agent.adapt_param_noise()
                #     epoch_adaptive_distances.append(distance)

                # cl, al = agent.train()
                # epoch_critic_losses.append(cl)
                # epoch_actor_losses.append(al)
                # agent.update_target_net()

            # Evaluate.
            eval_episode_rewards = []
            eval_qs = []
            if eval_env is not None:
                nenvs_eval = eval_obs.shape[0]
                eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32)
                for t_rollout in range(nb_eval_steps):
                    eval_action, eval_q, _, _ = agent.step(eval_obs, apply_noise=False, compute_Q=True)
                    # eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    eval_obs, eval_r, eval_done, eval_info = eval_env.step(eval_action)
                    if render_eval:
                        eval_env.render()
                    eval_episode_reward += eval_r

                    eval_qs.append(eval_q)
                    for d in range(len(eval_done)):
                        if eval_done[d]:
                            eval_episode_rewards.append(eval_episode_reward[d])
                            eval_episode_rewards_history.append(eval_episode_reward[d])
                            eval_episode_reward[d] = 0.0

        mpi_size = MPI.COMM_WORLD.Get_size()
        # Log stats.
        # XXX shouldn't call np.mean on variable length lists
        duration = time.time() - start_time
        stats = agent.get_stats()
        combined_stats = stats.copy()
        combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
        combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
        combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
        combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
        combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
        combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
        combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
        combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
        combined_stats['total/duration'] = duration
        combined_stats['total/steps_per_second'] = float(t) / float(duration)
        combined_stats['total/episodes'] = episodes
        combined_stats['rollout/episodes'] = epoch_episodes
        combined_stats['rollout/actions_std'] = np.std(epoch_actions)

        mean_epoch_episode_rewards.append(np.mean(epoch_episode_rewards))
        # print(step_set,mean_epoch_episode_rewards)
        step_set.append(t)
        plt.plot(step_set, mean_epoch_episode_rewards)
        plt.xlabel('Steps')
        plt.ylabel('Mean Episode Reward')
        plt.savefig('ddpg_mean_test.png')
        # plt.show()

        # Evaluation statistics.
        if eval_env is not None:
            combined_stats['eval/return'] = eval_episode_rewards
            combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
            combined_stats['eval/Q'] = eval_qs
            combined_stats['eval/episodes'] = len(eval_episode_rewards)
        def as_scalar(x):
            if isinstance(x, np.ndarray):
                assert x.size == 1
                return x[0]
            elif np.isscalar(x):
                return x
            else:
                raise ValueError('expected scalar, got %s' % x)
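        # MPI averaging: allreduce sums each scalar stat across workers, and dividing by mpi_size below yields the mean.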

        combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([np.array(x).flatten()[0] for x in combined_stats.values()]))
        combined_stats = {k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums)}

        # Total statistics.
        combined_stats['total/epochs'] = epoch + 1
        combined_stats['total/steps'] = t

        for key in sorted(combined_stats.keys()):
            logger.record_tabular(key, combined_stats[key])

        if rank == 0:
            logger.dump_tabular()
        logger.info('')
        logdir = logger.get_dir()
        if rank == 0 and logdir:
            if hasattr(env, 'get_state'):
                with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                    pickle.dump(env.get_state(), f)
            if eval_env and hasattr(eval_env, 'get_state'):
                with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                    pickle.dump(eval_env.get_state(), f)


    return agent
Exemplo n.º 10
0
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)
    env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
        env = Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
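    # e.g. noise_type='ou_0.2' yields OU action noise with stddev 0.2, while
    # 'normal_0.1,adaptive-param_0.2' combines Gaussian action noise with adaptive parameter noise.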
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                        sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        start_time = time.time()
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
                   action_noise=action_noise, actor=actor, critic=critic, memory=memory, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
Exemplo n.º 11
0
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)

    # ---------- AMEND: specific setting for brsEngine -----------
    print("kwargs", kwargs)
    env.reward_type = kwargs['reward_type']
    env.set_additional_goal = kwargs['set_additional_goal']
    kwargs.pop('reward_type', None)
    kwargs.pop('set_additional_goal', None)
    brsEngine = None
    if env.reward_type == 'ttr':
        if env_id == 'DubinsCarEnv-v0':
            brsEngine = DubinsCar_brs_engine()
            brsEngine.reset_variables()
        elif env_id == 'PlanarQuadEnv-v0':
            brsEngine = Quadrotor_brs_engine()
            brsEngine.reset_variables()
        else:
            raise ValueError("invalid environment name for ttr reward!")
        # You have to assign the engine!
        env.brsEngine = brsEngine
    # -----------------------------------------------------------

    env = bench.Monitor(
        env,
        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        # ---------- AMEND: specific setting for brsEngine -----------
        eval_env.brsEngine = brsEngine
        # ------------------------------------------------------------

        eval_env = bench.Monitor(eval_env,
                                 os.path.join(logger.get_dir(), 'gym_eval'))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) *
                                             np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed,
                                                     logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        start_time = time.time()

    training.train(env=env,
                   eval_env=eval_env,
                   param_noise=param_noise,
                   action_noise=action_noise,
                   actor=actor,
                   critic=critic,
                   memory=memory,
                   **kwargs)
    env.close()

    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
Exemplo n.º 12
0
    def __init__(self,
                 observation_shape,
                 action_shape,
                 nb_demo_kine,
                 nb_key_states,
                 batch_size=128,
                 noise_type='',
                 actor=None,
                 critic=None,
                 layer_norm=True,
                 observation_range=(-5., 5.),
                 action_range=(-1., 1.),
                 return_range=(-np.inf, np.inf),
                 normalize_returns=False,
                 normalize_observations=True,
                 reward_scale=1.,
                 clip_norm=None,
                 demo_l2_reg=0.,
                 critic_l2_reg=0.,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 demo_lr=5e-3,
                 gamma=0.99,
                 tau=0.001,
                 enable_popart=False,
                 save_ckpt=True):

        # Noise
        nb_actions = action_shape[-1]
        param_noise, action_noise = process_noise_type(noise_type, nb_actions)

        logger.info('param_noise', param_noise)
        logger.info('action_noise', action_noise)

        # States recording
        self.memory = Memory(limit=int(2e5),
                             action_shape=action_shape,
                             observation_shape=observation_shape)

        # Models
        self.nb_demo_kine = nb_demo_kine
        self.actor = actor or Actor(
            nb_actions, nb_demo_kine, layer_norm=layer_norm)
        self.nb_key_states = nb_key_states
        self.critic = critic or Critic(nb_key_states, layer_norm=layer_norm)
        self.nb_obs_org = nb_key_states

        # Inputs.
        self.obs0 = tf.placeholder(tf.float32,
                                   shape=(None, ) + observation_shape,
                                   name='obs0')
        self.obs1 = tf.placeholder(tf.float32,
                                   shape=(None, ) + observation_shape,
                                   name='obs1')
        self.terminals1 = tf.placeholder(tf.float32,
                                         shape=(None, 1),
                                         name='terminals1')
        self.rewards = tf.placeholder(tf.float32,
                                      shape=(None, 1),
                                      name='rewards')
        self.actions = tf.placeholder(tf.float32,
                                      shape=(None, ) + action_shape,
                                      name='actions')
        # self.critic_target_Q: value assigned by self.target_Q_obs0
        self.critic_target_Q = tf.placeholder(tf.float32,
                                              shape=(None, 1),
                                              name='critic_target_Q')
        self.param_noise_stddev = tf.placeholder(tf.float32,
                                                 shape=(),
                                                 name='param_noise_stddev')

        # change in observations
        self.obs_delta_kine = (self.obs1 - self.obs0)[:, :self.nb_demo_kine]
        self.obs_delta_kstates = (self.obs1 -
                                  self.obs0)[:, :self.nb_key_states]

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range

        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.demo_lr = demo_lr
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg
        self.demo_l2_reg = demo_l2_reg

        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None

        self.normalized_obs0 = tf.clip_by_value(
            obs_norm_partial(self.obs0, self.obs_rms, self.nb_obs_org),
            self.observation_range[0], self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(
            obs_norm_partial(self.obs1, self.obs_rms, self.nb_obs_org),
            self.observation_range[0], self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        target_actor = copy(self.actor)
        target_actor.name = 'target_actor'
        self.target_actor = target_actor
        target_critic = copy(self.critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across set-up parts.
        # the actor output is [0,1], need to normalised to [-1,1] before feeding into critic
        self.actor_tf, self.demo_aprx = self.actor(self.normalized_obs0)

        # critic loss
        # normalized_critic_tf, pred_rwd, pred_obs_delta: critic_loss
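        # this Critic variant apparently returns auxiliary heads (a predicted reward and a predicted observation delta) alongside the normalized Q value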
        self.normalized_critic_tf, self.pred_rwd, self.pred_obs_delta = self.critic(
            self.normalized_obs0, act_norm(self.actions))
        # self.critic_tf: only in logging [reference_Q_mean/std]
        self.critic_tf = ret_denormalize(
            tf.clip_by_value(self.normalized_critic_tf, self.return_range[0],
                             self.return_range[1]), self.ret_rms)

        # actor loss
        normalized_critic_with_actor_tf = self.critic(self.normalized_obs0,
                                                      act_norm(self.actor_tf),
                                                      reuse=True)[0]
        # self.critic_with_actor_tf: actor loss, and logging [reference_Q_tf_mean/std]
        self.critic_with_actor_tf = ret_denormalize(
            tf.clip_by_value(normalized_critic_with_actor_tf,
                             self.return_range[0], self.return_range[1]),
            self.ret_rms)

        # target Q
        self.target_action = tf.clip_by_value(
            target_actor(normalized_obs1)[0], self.action_range[0],
            self.action_range[1])
        self.target_Q_obs1 = ret_denormalize(
            target_critic(normalized_obs1, act_norm(self.target_action))[0],
            self.ret_rms)
        self.target_Q_obs0 = self.rewards + (
            1. - self.terminals1) * gamma * self.target_Q_obs1
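        # Bellman target: Q_target(s0, a0) = r + (1 - terminal) * gamma * Q'(s1, pi'(s1))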

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(self.normalized_obs0)

        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()
        self.dbg_vars = self.actor.dbg_vars + self.critic.dbg_vars

        self.sess = None
        # Set up checkpoint saver
        self.save_ckpt = save_ckpt
        if save_ckpt:
            self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=20)
        else:
            # saver for loading ckpt
            self.saver = tf.train.Saver()

        self.main_summaries = tf.summary.merge_all()
        logdir = logger.get_dir()
        if logdir:
            self.train_writer = tf.summary.FileWriter(
                os.path.join(logdir, 'tb'), tf.get_default_graph())
        else:
            self.train_writer = None
Exemplo n.º 13
0
def learn(
        save_path,
        network,
        env,
        seed=None,
        total_timesteps=None,
        nb_epochs=None,  # with default settings, perform 1M steps total
        nb_epoch_cycles=7,  #50
        nb_rollout_steps=3,  #100
        reward_scale=1.0,
        render=False,
        render_eval=False,
        #   noise_type='adaptive-param_0.2',
        #   noise_type='normal_0.2',        # large noise
        #   noise_type='normal_0.02',       # small noise
        noise_type='normal_2.0',

        # the action range is 360 (degrees), so the noise scale should be chosen accordingly
        #   noise_type='normal_5',        # large noise
        #   noise_type='normal_0.2',       # small noise
        #   noise_type='normal_0.00001',      # no noise
        #   noise_type='ou_0.9',
        normalize_returns=False,
        normalize_observations=True,
        critic_l2_reg=1e-2,
        actor_lr=1e-4,  # large lr
        critic_lr=1e-3,  # large lr
        #   actor_lr=1e-7,      # small lr
        #   critic_lr=1e-3,     # small lr
        #   actor_lr = 1e-10,    # no lr
        #   critic_lr=1e-10,     # no lr
        popart=False,
        gamma=0.99,
        clip_norm=None,
        nb_train_steps=3,  # per epoch cycle and MPI worker,  50
        nb_eval_steps=1,  #100
        batch_size=640,  # per MPI worker
        tau=0.01,
        eval_env=None,
        param_noise_adaption_interval=3,  #50
        **network_kwargs):

    if total_timesteps is not None:
        assert nb_epochs is None
        nb_epochs = int(total_timesteps) // (nb_epoch_cycles *
                                             nb_rollout_steps)
    else:
        nb_epochs = 500

    rank = MPI.COMM_WORLD.Get_rank()
    nb_actions = env.num_actions

    action_shape = np.array(nb_actions * [0]).shape
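    # builds a dummy array of length nb_actions just to obtain the shape tuple (nb_actions,), since this env exposes num_actions rather than a gym action_space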

    #4 pairs pos + 3 link length
    # nb_features = 2*(env.num_actions+1)+env.num_actions

    #4 pairs pos + 1 pair target pos
    nb_features = 2 * (env.num_actions + 2)

    observation_shape = np.array(nb_features * [0]).shape
    # assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.

    # memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape)
    memory = Memory(limit=int(1e6),
                    action_shape=action_shape,
                    observation_shape=observation_shape)
    critic = Critic(network=network, **network_kwargs)
    actor = Actor(nb_actions, network=network, **network_kwargs)

    action_noise = None
    param_noise = None
    # nb_actions = env.action_space.shape[-1]
    if noise_type is not None:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                                 sigma=float(stddev) *
                                                 np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))

    # max_action = env.action_space.high
    # logger.info('scaling actions by {} before executing in env'.format(max_action))

    # agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
    agent = DDPG(actor,
                 critic,
                 memory,
                 observation_shape,
                 action_shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    sess = U.get_session()
    # Prepare everything.
    agent.initialize(sess)
    # sess.graph.finalize()

    agent.reset()

    obs = env.reset()
    if eval_env is not None:
        eval_obs = eval_env.reset()
    nenvs = obs.shape[0]

    episode_reward = np.zeros(nenvs, dtype=np.float32)  #vector
    episode_step = np.zeros(nenvs, dtype=int)  # vector
    episodes = 0  #scalar
    t = 0  # scalar
    step_set = []
    reward_set = []

    epoch = 0

    start_time = time.time()

    epoch_episode_rewards = []
    mean_epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_actions = []
    epoch_qs = []
    episode_end_distance = []
    epoch_episodes = 0
    SPARSE_REWARD = False
    '''added: initialize any variables that were not restored from the checkpoint'''
    agent.load_ini(sess, save_path)
    for epoch in range(nb_epochs):
        print('epochs: ', epoch)
        obs = env.reset()
        agent.save(save_path)
        epoch_episode_rewards = []

        for cycle in range(nb_epoch_cycles):
            # Perform rollouts.
            if nenvs > 1:
                # if simulating multiple envs in parallel, impossible to reset agent at the end of the episode in each
                # of the environments, so resetting here instead
                agent.reset()
            for t_rollout in range(nb_rollout_steps):
                # Predict next action.
                action, q, _, _ = agent.step(obs,
                                             apply_noise=True,
                                             compute_Q=True)
                # print('action:', action)

                if SPARSE_REWARD:
                    new_obs, r, done, end_distance = env.step(
                        action, SPARSE_REWARD)
                else:
                    new_obs, r, done = env.step(action, SPARSE_REWARD)

                t += 1

                episode_reward += r
                episode_step += 1
                # print('episode_re: ', episode_reward) #[1.]

                # Book-keeping.
                epoch_actions.append(action)
                epoch_qs.append(q)
                b = 1.
                agent.store_transition(
                    obs, action, r, new_obs, done
                )  #the batched data will be unrolled in memory.py's append.
                # print('r: ', r)
                # '''r shape: (1,)'''
                obs = new_obs

            epoch_episode_rewards.append(episode_reward)
            if cycle == nb_epoch_cycles - 1:
                # record the distance from the reacher's end position to the goal at the last step of each episode
                if SPARSE_REWARD:
                    episode_end_distance.append(end_distance)
                else:
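                    # assumes the dense reward has the form r = 100 / (1 + distance), so the distance is recovered by inverting it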
                    end_distance = 100.0 / r - 1
                    episode_end_distance.append(end_distance[0])

            episode_reward = np.zeros(nenvs, dtype=np.float32)  #vector

            # Train.
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []

            # warm-up: fill the replay memory using the noisy, freshly initialized policy and pre-update the critic networks
            preheating_step = 30  # 50 episodes = 600 steps, at 12 steps per episode
            if epoch > preheating_step:
                # print('memory_entries: ',memory.nb_entries)
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
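                    # adapt_param_noise() measures how far the perturbed policy's actions drift from the unperturbed ones and rescales the parameter-noise stddev toward desired_action_stddev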
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)
                    # print('Train!')
                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()
            else:
                # at the start, update only the two critic networks (critic and its target)
                cl = agent.update_critic()
                epoch_critic_losses.append(cl)
                print('critic loss in initial training: ', cl)
                pass

            # Evaluate.
            eval_episode_rewards = []
            eval_qs = []
            if eval_env is not None:
                nenvs_eval = eval_obs.shape[0]
                eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32)
                for t_rollout in range(nb_eval_steps):
                    eval_action, eval_q, _, _ = agent.step(eval_obs,
                                                           apply_noise=False,
                                                           compute_Q=True)
                    # eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                        eval_action)
                    if render_eval:
                        eval_env.render()
                    eval_episode_reward += eval_r

                    eval_qs.append(eval_q)
                    for d in range(len(eval_done)):
                        if eval_done[d]:
                            eval_episode_rewards.append(eval_episode_reward[d])
                            eval_episode_rewards_history.append(
                                eval_episode_reward[d])
                            eval_episode_reward[d] = 0.0

        mpi_size = MPI.COMM_WORLD.Get_size()
        # Log stats.
        # XXX shouldn't call np.mean on variable length lists
        duration = time.time() - start_time
        stats = agent.get_stats()
        combined_stats = stats.copy()
        combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
        combined_stats['rollout/return_history'] = np.mean(
            episode_rewards_history)
        combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
        combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
        combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
        combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
        combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
        combined_stats['train/param_noise_distance'] = np.mean(
            epoch_adaptive_distances)
        combined_stats['total/duration'] = duration
        combined_stats['total/steps_per_second'] = float(t) / float(duration)
        combined_stats['total/episodes'] = episodes
        combined_stats['rollout/episodes'] = epoch_episodes
        combined_stats['rollout/actions_std'] = np.std(epoch_actions)

        mean_epoch_episode_rewards.append(np.mean(epoch_episode_rewards))
        # print(step_set,mean_epoch_episode_rewards)
        step_set.append(t)
        plt.figure(1)
        plt.plot(step_set, mean_epoch_episode_rewards)
        plt.xlabel('Steps')
        plt.ylabel('Mean Episode Reward')
        plt.savefig('ddpg_mean.png')

        plt.figure(2)
        plt.plot(step_set, episode_end_distance)
        plt.xlabel('Steps')
        plt.ylabel('Distance to Target')
        plt.savefig('ddpgini_distance.png')
        # plt.show()

        # Evaluation statistics.
        if eval_env is not None:
            combined_stats['eval/return'] = eval_episode_rewards
            combined_stats['eval/return_history'] = np.mean(
                eval_episode_rewards_history)
            combined_stats['eval/Q'] = eval_qs
            combined_stats['eval/episodes'] = len(eval_episode_rewards)

        def as_scalar(x):
            if isinstance(x, np.ndarray):
                assert x.size == 1
                return x[0]
            elif np.isscalar(x):
                return x
            else:
                raise ValueError('expected scalar, got %s' % x)

        combined_stats_sums = MPI.COMM_WORLD.allreduce(
            np.array(
                [np.array(x).flatten()[0] for x in combined_stats.values()]))
        combined_stats = {
            k: v / mpi_size
            for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
        }

        # Total statistics.
        combined_stats['total/epochs'] = epoch + 1
        combined_stats['total/steps'] = t

        for key in sorted(combined_stats.keys()):
            logger.record_tabular(key, combined_stats[key])

        if rank == 0:
            logger.dump_tabular()
        logger.info('')
        logdir = logger.get_dir()
        if rank == 0 and logdir:
            if hasattr(env, 'get_state'):
                with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                    pickle.dump(env.get_state(), f)
            if eval_env and hasattr(eval_env, 'get_state'):
                with open(os.path.join(logdir, 'eval_env_state.pkl'),
                          'wb') as f:
                    pickle.dump(eval_env.get_state(), f)

    print('stepset: ', step_set)
    print('rewards: ', mean_epoch_episode_rewards)
    print('distances: ', episode_end_distance)

    return agent
Exemplo n.º 14
0
class DDPGAgent:
    def __init__(self,
                 env,
                 gamma,
                 tau,
                 buffer_maxlen,
                 critic_learning_rate,
                 actor_learning_rate,
                 max_action=1):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.noise = OUNoise(env.action_space)
        self.iter = 0.0
        self.noisy = False
        self.max_action = max_action

        print(self.action_dim)
        print(self.obs_dim)

        # RL hyperparameters
        self.env = env
        self.gamma = gamma
        self.tau = tau

        # Initialize critic and actor networks
        self.critic = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.critic_target = Critic(self.obs_dim,
                                    self.action_dim).to(self.device)

        self.actor = Actor(self.obs_dim, self.action_dim,
                           self.max_action).to(self.device)
        self.actor_target = Actor(self.obs_dim, self.action_dim,
                                  self.max_action).to(self.device)

        # Copy parameters into the target networks for both actor and critic
        for target_param, param in zip(self.actor_target.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(param.data)

        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data)

        # Set Optimization algorithms
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=critic_learning_rate)
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=actor_learning_rate)

        self.replay_buffer = ExperienceReplayLog(buffer_maxlen)

    def get_action(self, obs):
        #print('obs;',obs)

        if self.noisy:
            state = torch.FloatTensor(obs).unsqueeze(0).to(self.device)
            action = self.actor.forward(state)
            action = action.squeeze(0).cpu().detach().numpy()
            action = self.noise.get_action(action, self.iter)
            self.iter = self.iter + 1

        else:
            state = torch.FloatTensor(obs).unsqueeze(0).to(self.device)
            action = self.actor.forward(state)
            action = action.squeeze(0).cpu().detach().numpy()

        return action

    def update(self, batch_size):

        # Batch update: sample a single batch of transitions from the replay buffer
        state_batch, action_batch, reward_batch, next_state_batch, masks = self.replay_buffer.sample(
            batch_size)
        state_batch = torch.FloatTensor(state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        masks = torch.FloatTensor(masks).to(self.device)

        # Q info updates
        curr_Q = self.critic.forward(state_batch, action_batch)
        next_actions = self.actor_target.forward(next_state_batch)
        next_Q = self.critic_target.forward(next_state_batch,
                                            next_actions.detach())
        expected_Q = reward_batch + self.gamma * next_Q
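        # Note: `masks` (typically 1 - done) is sampled above but not applied here; a standard DDPG target would use reward_batch + self.gamma * masks * next_Q so bootstrapping stops at terminal states.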

        # Update Critic network
        q_loss = F.mse_loss(curr_Q, expected_Q.detach())

        self.critic_optimizer.zero_grad()
        q_loss.backward()

        self.critic_optimizer.step()

        # Update Actor network
        policy_loss = -self.critic.forward(
            state_batch, self.actor.forward(state_batch)).mean()

        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()

        # Update Actor and Critic target networks
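        # Polyak (soft) update: theta_target <- tau * theta + (1 - tau) * theta_target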
        for target_param, param in zip(self.actor_target.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data *
                                    (1.0 - self.tau))

        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data *
                                    (1.0 - self.tau))
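
A minimal sketch of how this DDPGAgent might be driven, assuming the old Gym step API, a continuous-control environment such as "Pendulum-v1", and that ExperienceReplayLog exposes push(state, action, reward, next_state, done) and __len__ (the env id, loop lengths, and the push/__len__ API are assumptions, not part of the example above):

import gym

# Hypothetical training loop for DDPGAgent; see assumptions noted above.
env = gym.make("Pendulum-v1")
agent = DDPGAgent(env, gamma=0.99, tau=1e-2, buffer_maxlen=100000,
                  critic_learning_rate=1e-3, actor_learning_rate=1e-4)
agent.noisy = True  # enable OU exploration noise during data collection

batch_size = 64
for episode in range(50):
    state = env.reset()
    episode_reward = 0.0
    for step in range(200):
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        # store the transition (assumed ExperienceReplayLog.push API)
        agent.replay_buffer.push(state, action, reward, next_state, done)
        if len(agent.replay_buffer) > batch_size:  # assumed __len__ support
            agent.update(batch_size)
        state = next_state
        episode_reward += reward
        if done:
            break
    print("episode", episode, "return", episode_reward)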