Example #1
    def __init__(self, state_space, action_space, buffer_size, batch_size, learning_rate_actor, learning_rate_critic, update_rate, gamma, tau, device, seed, num_agents, epsilon, epsilon_decay, epsilon_min):
        self.num_agents = num_agents
        self.action_space = action_space
        self.state_space = state_space
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.step_count = 0.
        self.update_rate = update_rate
        self.tau = tau
        self.seed = seed
        self.device = device
        self.gamma = gamma
        self.actor_local_network = ActorNetwork(state_space, action_space, device, seed).to(device)
        self.actor_target_network = ActorNetwork(state_space, action_space, device, seed).to(device)
        self.critic_local_network = CriticNetwork(state_space, action_space, device, seed).to(device)
        self.critic_target_network = CriticNetwork(state_space, action_space, device, seed).to(device)
        
        
        self.actor_optimizer = torch.optim.Adam(self.actor_local_network.parameters(), lr=learning_rate_actor)
        self.critic_optimizer = torch.optim.Adam(self.critic_local_network.parameters(), lr=learning_rate_critic)
 
        self.noise = OUNoise(action_space, seed)
        self.memory = ReplayBuffer(buffer_size = self.buffer_size, batch_size=self.batch_size, 
                                   device=device, seed=seed)
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
Example #2
    def __init__(self,
                 env_id,
                 action_space,
                 trajectory_size=256,
                 n_envs=1,
                 max_timesteps=1500):

        self.env_id = env_id

        self.n_envs = n_envs

        self.trajectory_size = trajectory_size

        self.vecenv = VecEnv(env_id=self.env_id,
                             n_envs=self.n_envs,
                             max_timesteps=max_timesteps)

        self.policy = PolicyNetwork(action_space=action_space)

        self.old_policy = PolicyNetwork(action_space=action_space)

        self.critic = CriticNetwork()

        self.r_running_stats = util.RunningStats(shape=(action_space, ))

        self._init_network()
Example #3
    def __init__(self,
                 state_dim,
                 action_dim,
                 lr_actor=1e-4,
                 lr_critic=1e-4,
                 lr_decay=.95,
                 replay_buff_size=10000,
                 gamma=.99,
                 batch_size=128,
                 random_seed=42,
                 soft_update_tau=1e-3,
                 actor_layer_dim_1=128,
                 actor_layer_dim_2=128,
                 actor_layer_dim_3=0,
                 critic_layer_dim_1=128,
                 critic_layer_dim_2=64,
                 critic_layer_dim_3=0):
        """
        Initialize model
        """
        self.lr_actor = lr_actor
        self.gamma = gamma
        self.lr_critic = lr_critic
        self.lr_decay = lr_decay
        self.tau = soft_update_tau

        self.actor_local = ActorNetwork(state_dim, action_dim,
                                        actor_layer_dim_1, actor_layer_dim_2,
                                        actor_layer_dim_3).to(device=device)
        self.actor_target = ActorNetwork(state_dim, action_dim,
                                         actor_layer_dim_1, actor_layer_dim_2,
                                         actor_layer_dim_3).to(device=device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.lr_actor)

        self.critic_local = CriticNetwork(state_dim, action_dim,
                                          critic_layer_dim_1,
                                          critic_layer_dim_2,
                                          critic_layer_dim_3).to(device=device)
        self.critic_target = CriticNetwork(
            state_dim, action_dim, critic_layer_dim_1, critic_layer_dim_2,
            critic_layer_dim_3).to(device=device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.lr_critic)

        self.noise = OUNoise(action_dim, random_seed)

        self.memory = ReplayBuffer(action_dim, replay_buff_size, batch_size,
                                   random_seed)
        self.path = ""
Example #4
    def __init__(self, state_size, action_size, hd1_units=400, hd2_units=300, random_seed=0, buffer_size=int(2e5), batch_size=256, tau=0.0005, actorLr=1e-3, criticLr=1e-3, weight_decay=0, update_every=20, gamma=0.99):
        """ :state_size (int): dimension of each state
            :action_size (int): dimension of each action
            :hd1_units (int): number of the first hidden layer units
            :hd2_units (int): number of the second hidden layer units
            :random_seed (int): random seed
            :buffer_size (int): replay buffer size
            :batch_size (int): batch size
            :tau (float): interpolation factor
            :actorLr (float): actor learning rate
            :criticLr (float): critic learning rate
            :weight_decay (float): Optimizer L2 penalty
            :update_every (int): learning frequency
            :gamma (float): Discount factor
        """
        self.state_size = state_size
        self.action_size = action_size
        self.update_every = update_every
        self.gamma = gamma
        self.tau = tau
        random.seed(random_seed)

        # Actor & Target Networks
        self.actor_local = ActorNetwork(state_size, action_size, random_seed, hd1_units, hd2_units).to(device)
        self.actor_target = ActorNetwork(state_size, action_size, random_seed, hd1_units, hd2_units).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=actorLr, weight_decay = weight_decay)

        # Critic & Target Networks
        self.critic_local = CriticNetwork(state_size, action_size, random_seed, 400, 300).to(device)
        self.critic_target = CriticNetwork(state_size, action_size, random_seed, 400, 300).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=criticLr, weight_decay=weight_decay)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, random_seed)

        self.t_step = 0
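
A minimal instantiation sketch for the constructor documented above. The enclosing class is not shown in this excerpt, so Agent is a hypothetical placeholder and the state/action sizes are illustrative:

# Hypothetical usage; Agent stands in for the unnamed class that owns this __init__,
# and 33/4 are illustrative state/action dimensions.
agent = Agent(state_size=33, action_size=4, random_seed=0)
# buffer_size, batch_size, tau, actorLr, criticLr, weight_decay, update_every and gamma
# keep the defaults declared in the signature above.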
Example #5
    def __init__(self):

        self.env = gym.make(self.ENV_ID)

        self.env.max_episode_steps = 3000

        self.actor = ActorNetwork(action_space=self.ACTION_SPACE,
                                  max_action=self.MAX_ACTION)

        self.target_actor = ActorNetwork(action_space=self.ACTION_SPACE,
                                         max_action=self.MAX_ACTION)

        self.critic = CriticNetwork()

        self.target_critic = CriticNetwork()

        self.buffer = ReplayBuffer(max_experiences=self.MAX_EXPERIENCES)

        self.global_steps = 0

        self.hiscore = None

        self._build_networks()
Example #6
class TD3Agent:

    MAX_EXPERIENCES = 30000

    MIN_EXPERIENCES = 300

    ENV_ID = "Pendulum-v0"

    ACTION_SPACE = 1

    MAX_ACTION = 2

    OBSERVATION_SPACE = 3

    CRITIC_UPDATE_PERIOD = 4

    POLICY_UPDATE_PERIOD = 8

    TAU = 0.02

    GAMMA = 0.99

    BATCH_SIZE = 64

    NOISE_STDDEV = 0.2

    def __init__(self):

        self.env = gym.make(self.ENV_ID)

        self.env.max_episode_steps = 3000

        self.actor = ActorNetwork(action_space=self.ACTION_SPACE,
                                  max_action=self.MAX_ACTION)

        self.target_actor = ActorNetwork(action_space=self.ACTION_SPACE,
                                         max_action=self.MAX_ACTION)

        self.critic = CriticNetwork()

        self.target_critic = CriticNetwork()

        self.buffer = ReplayBuffer(max_experiences=self.MAX_EXPERIENCES)

        self.global_steps = 0

        self.hiscore = None

        self._build_networks()

    def _build_networks(self):
        """パラメータの初期化
        """

        dummy_state = np.random.normal(0, 0.1, size=self.OBSERVATION_SPACE)
        dummy_state = (dummy_state[np.newaxis, ...]).astype(np.float32)

        dummy_action = np.random.normal(0, 0.1, size=self.ACTION_SPACE)
        dummy_action = (dummy_action[np.newaxis, ...]).astype(np.float32)

        self.actor.call(dummy_state)
        self.target_actor.call(dummy_state)
        self.target_actor.set_weights(self.actor.get_weights())

        self.critic.call(dummy_state, dummy_action, training=False)
        self.target_critic.call(dummy_state, dummy_action, training=False)
        self.target_critic.set_weights(self.critic.get_weights())

    def play(self, n_episodes):

        total_rewards = []

        recent_scores = collections.deque(maxlen=10)

        for n in range(n_episodes):

            total_reward, localsteps = self.play_episode()

            total_rewards.append(total_reward)

            recent_scores.append(total_reward)

            recent_average_score = sum(recent_scores) / len(recent_scores)

            print(f"Episode {n}: {total_reward}")
            print(f"Local steps {localsteps}")
            print(f"Experiences {len(self.buffer)}")
            print(f"Global step {self.global_steps}")
            print(f"Noise stdev {self.NOISE_STDDEV}")
            print(f"recent average score {recent_average_score}")
            print()

            if (self.hiscore is None) or (recent_average_score > self.hiscore):
                self.hiscore = recent_average_score
                print(f"HISCORE Updated: {self.hiscore}")
                self.save_model()

        return total_rewards

    def play_episode(self):

        total_reward = 0

        steps = 0

        done = False

        state = self.env.reset()

        while not done:

            action = self.actor.sample_action(state, noise=self.NOISE_STDDEV)

            next_state, reward, done, _ = self.env.step(action)

            exp = Experience(state, action, reward, next_state, done)

            self.buffer.add_experience(exp)

            state = next_state

            total_reward += reward

            steps += 1

            self.global_steps += 1

            #: Delayed Policy update
            if self.global_steps % self.CRITIC_UPDATE_PERIOD == 0:
                if self.global_steps % self.POLICY_UPDATE_PERIOD == 0:
                    self.update_network(self.BATCH_SIZE, update_policy=True)
                    self.update_target_network()
                else:
                    self.update_network(self.BATCH_SIZE)

        return total_reward, steps

    def update_network(self, batch_size, update_policy=False):

        if len(self.buffer) < self.MIN_EXPERIENCES:
            return

        (states, actions, rewards, next_states,
         dones) = self.buffer.get_minibatch(batch_size)

        clipped_noise = np.clip(np.random.normal(0, 0.2, self.ACTION_SPACE),
                                -0.5, 0.5)

        next_actions = self.target_actor(
            next_states) + clipped_noise * self.MAX_ACTION

        q1, q2 = self.target_critic(next_states, next_actions)
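        #: Clipped double Q-learning (TD3): use the element-wise minimum of the two target critic heads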

        next_qvalues = [
            min(q1, q2) for q1, q2 in zip(q1.numpy().flatten(),
                                          q2.numpy().flatten())
        ]

        #: Compute target values and update CriticNetwork
        target_values = np.vstack([
            reward + self.GAMMA * next_qvalue if not done else reward
            for reward, done, next_qvalue in zip(rewards, dones, next_qvalues)
        ]).astype(np.float32)

        #: Update Critic
        with tf.GradientTape() as tape:
            q1, q2 = self.critic(states, actions)
            loss1 = tf.reduce_mean(tf.square(target_values - q1))
            loss2 = tf.reduce_mean(tf.square(target_values - q2))
            loss = loss1 + loss2

        variables = self.critic.trainable_variables
        gradients = tape.gradient(loss, variables)
        self.critic.optimizer.apply_gradients(zip(gradients, variables))

        #: Delayed Update ActorNetwork
        if update_policy:

            with tf.GradientTape() as tape:
                q1, _ = self.critic(states, self.actor(states))
                J = -1 * tf.reduce_mean(q1)

            variables = self.actor.trainable_variables
            gradients = tape.gradient(J, variables)
            self.actor.optimizer.apply_gradients(zip(gradients, variables))

    def update_target_network(self):

        # soft-target update Actor
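        # Polyak averaging: target <- (1 - TAU) * target + TAU * online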
        target_actor_weights = self.target_actor.get_weights()
        actor_weights = self.actor.get_weights()

        assert len(target_actor_weights) == len(actor_weights)

        self.target_actor.set_weights((1 - self.TAU) *
                                      np.array(target_actor_weights) +
                                      (self.TAU) * np.array(actor_weights))

        # soft-target update Critic
        target_critic_weights = self.target_critic.get_weights()
        critic_weights = self.critic.get_weights()

        assert len(target_critic_weights) == len(critic_weights)

        self.target_critic.set_weights((1 - self.TAU) *
                                       np.array(target_critic_weights) +
                                       (self.TAU) * np.array(critic_weights))

    def save_model(self):

        self.actor.save_weights("checkpoints/actor")

        self.critic.save_weights("checkpoints/critic")

    def load_model(self):

        self.actor.load_weights("checkpoints/actor")

        self.target_actor.load_weights("checkpoints/actor")

        self.critic.load_weights("checkpoints/critic")

        self.target_critic.load_weights("checkpoints/critic")

    def test_play(self, n, monitordir, load_model=False):

        if load_model:
            self.load_model()

        if monitordir:
            env = wrappers.Monitor(gym.make(self.ENV_ID),
                                   monitordir,
                                   force=True,
                                   video_callable=(lambda ep: ep % 1 == 0))
        else:
            env = gym.make(self.ENV_ID)

        for i in range(n):

            total_reward = 0

            steps = 0

            done = False

            state = env.reset()

            while not done:

                action = self.actor.sample_action(state, noise=False)

                next_state, reward, done, _ = env.step(action)

                state = next_state

                total_reward += reward

                steps += 1

            print()
            print(f"Test Play {i}: {total_reward}")
            print(f"Steps:", steps)
            print()
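
A short driver sketch for the TD3Agent example above, using only methods it defines; the episode count and test settings are illustrative:

# Hypothetical driver for the TD3Agent example above.
agent = TD3Agent()
scores = agent.play(n_episodes=300)                      # collect experience and train with delayed policy/target updates
agent.test_play(n=3, monitordir=None, load_model=True)   # roll out the best checkpoint saved during training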
Example #7
    def __init__(self, env, config, reporter=None):
        super().__init__()

        self.env = env
        self.config = {**DdpgHer._default_config, **config}
        self.seed(self.config['seed'])

        a_space, obs_space = self.env.action_space, self.env.observation_space
        obs_size = obs_space.spaces['observation'].shape[0]
        goal_size = obs_space.spaces['desired_goal'].shape[0]
        self.env_params = get_env_params(self.env)
        self.reporter = reporter

        if self.config['cuda'] is None:
            self.config['cuda'] = torch.cuda.is_available()

        if self.config['cuda']:
            n_gpus = torch.cuda.device_count()
            assert n_gpus > 0
            max_gpus = self.config['max_gpus']
            if max_gpus is None:
                max_gpus = n_gpus
            n_gpus = min(n_gpus, max_gpus)
            n_workers = MPI.COMM_WORLD.size
            rank = MPI.COMM_WORLD.rank
            w_per_gpu = int(np.ceil(n_workers / n_gpus))
            gpu_i = rank // w_per_gpu
            print(f'Worker with rank {rank} assigned GPU {gpu_i}.')
            torch.cuda.set_device(gpu_i)

        self.bc_loss = self.config.get('demo_file') is not None
        self.q_filter = self.config['q_filter']

        # create the network
        self.actor_network = ActorNetwork(
            action_space=a_space,
            observation_space=obs_space,
            hidden_units=self.config['hidden_units'])
        self.critic_network = CriticNetwork(
            action_space=a_space,
            observation_space=obs_space,
            hidden_units=self.config['hidden_units'])

        # sync the networks across the cpus
        sync_networks(self.actor_network)
        sync_networks(self.critic_network)

        # build up the target network
        self.actor_target_network = ActorNetwork(
            action_space=a_space,
            observation_space=obs_space,
            hidden_units=self.config['hidden_units'])
        self.critic_target_network = CriticNetwork(
            action_space=a_space,
            observation_space=obs_space,
            hidden_units=self.config['hidden_units'])

        # load the weights into the target networks
        self.actor_target_network.load_state_dict(
            self.actor_network.state_dict())
        self.critic_target_network.load_state_dict(
            self.critic_network.state_dict())

        # if use gpu
        if self.config['cuda']:
            self.actor_network.cuda()
            self.critic_network.cuda()
            self.actor_target_network.cuda()
            self.critic_target_network.cuda()

        # create the optimizer
        self.actor_optim = torch.optim.Adam(self.actor_network.parameters(),
                                            lr=self.config['lr_actor'])
        self.critic_optim = torch.optim.Adam(self.critic_network.parameters(),
                                             lr=self.config['lr_critic'])

        # goal_space_bins should be of the form:
        # [dict(axis=0, box=np.linspace(0.0, 2.0, 15)), dict(axis=1, box=np.linspace(0.0, 2.0, 15)), ...]
        weight_her_sampling = False
        self._num_reached_goals_in_bin = None
        self._num_visited_goals_in_bin = None
        self._num_observed_goals_in_bin = None
        self._goal_space_bins = self.config['goal_space_bins']
        if self._goal_space_bins is not None:
            weight_her_sampling = True
            self._num_reached_goals_in_bin = np.zeros(
                tuple(1 + b['box'].size for b in self._goal_space_bins))
            self._num_visited_goals_in_bin = self._num_reached_goals_in_bin.copy(
            )
            self._num_observed_goals_in_bin = self._num_reached_goals_in_bin.copy(
            )

        # her sampler
        self.her_module = HerSampler(
            self.config['replay_strategy'],
            self.config['replay_k'],
            self.env.compute_reward,
            weight_sampling=weight_her_sampling,
            archer_params=self.config['archer_params'])

        # create the normalizer
        self.o_norm = Normalizer(size=obs_size,
                                 default_clip_range=self.config['clip_range'])
        self.g_norm = Normalizer(size=goal_size,
                                 default_clip_range=self.config['clip_range'])

        # create the replay and demo buffers
        self.buffer = ReplayBuffer(self.env_params, self.config['buffer_size'],
                                   self.her_module.sample_her_transitions)
        self.demo_buffer = None
        if self.bc_loss:
            self._init_demo_buffer(update_stats=True)

        self._trained = False
Example #8
class DdpgHer(object):

    _default_config = {
        'n_epochs': 50,
        'n_cycles': 50,
        'n_batches': 40,
        'checkpoint_freq': 5,
        'seed': 123,
        'num_workers': 1,
        'replay_strategy': 'future',
        'clip_return': 50.,
        'noise_eps': 0.2,
        'random_eps': 0.3,
        'buffer_size': int(1e6),
        'replay_k': 4,
        'clip_obs': 200.,
        'batch_size': 256,
        'hidden_units': 256,
        'gamma': 0.98,
        'action_l2': 1.,
        'lr_actor': 0.001,
        'lr_critic': 0.001,
        'polyak': 0.95,
        'n_test_rollouts': 10,
        'clip_range': 5.,
        'demo_length': 20,
        'local_dir': None,
        'cuda': None,
        'max_gpus': None,
        'rollouts_per_worker': 2,
        'goal_space_bins': None,
        'archer_params': None,
        'q_filter': False,
        'prm_loss_weight': 0.001,
        'aux_loss_weight': 0.0078,
        'demo_batch_size': None,
        'demo_file': None,
        'num_demo': 100,
    }

    def __init__(self, env, config, reporter=None):
        super().__init__()

        self.env = env
        self.config = {**DdpgHer._default_config, **config}
        self.seed(self.config['seed'])

        a_space, obs_space = self.env.action_space, self.env.observation_space
        obs_size = obs_space.spaces['observation'].shape[0]
        goal_size = obs_space.spaces['desired_goal'].shape[0]
        self.env_params = get_env_params(self.env)
        self.reporter = reporter

        if self.config['cuda'] is None:
            self.config['cuda'] = torch.cuda.is_available()

        if self.config['cuda']:
            n_gpus = torch.cuda.device_count()
            assert n_gpus > 0
            max_gpus = self.config['max_gpus']
            if max_gpus is None:
                max_gpus = n_gpus
            n_gpus = min(n_gpus, max_gpus)
            n_workers = MPI.COMM_WORLD.size
            rank = MPI.COMM_WORLD.rank
            w_per_gpu = int(np.ceil(n_workers / n_gpus))
            gpu_i = rank // w_per_gpu
            print(f'Worker with rank {rank} assigned GPU {gpu_i}.')
            torch.cuda.set_device(gpu_i)

        self.bc_loss = self.config.get('demo_file') is not None
        self.q_filter = self.config['q_filter']

        # create the network
        self.actor_network = ActorNetwork(
            action_space=a_space,
            observation_space=obs_space,
            hidden_units=self.config['hidden_units'])
        self.critic_network = CriticNetwork(
            action_space=a_space,
            observation_space=obs_space,
            hidden_units=self.config['hidden_units'])

        # sync the networks across the cpus
        sync_networks(self.actor_network)
        sync_networks(self.critic_network)

        # build up the target network
        self.actor_target_network = ActorNetwork(
            action_space=a_space,
            observation_space=obs_space,
            hidden_units=self.config['hidden_units'])
        self.critic_target_network = CriticNetwork(
            action_space=a_space,
            observation_space=obs_space,
            hidden_units=self.config['hidden_units'])

        # load the weights into the target networks
        self.actor_target_network.load_state_dict(
            self.actor_network.state_dict())
        self.critic_target_network.load_state_dict(
            self.critic_network.state_dict())

        # if use gpu
        if self.config['cuda']:
            self.actor_network.cuda()
            self.critic_network.cuda()
            self.actor_target_network.cuda()
            self.critic_target_network.cuda()

        # create the optimizer
        self.actor_optim = torch.optim.Adam(self.actor_network.parameters(),
                                            lr=self.config['lr_actor'])
        self.critic_optim = torch.optim.Adam(self.critic_network.parameters(),
                                             lr=self.config['lr_critic'])

        # goal_space_bins should be of the form:
        # [dict(axis=0, box=np.linspace(0.0, 2.0, 15)), dict(axis=1, box=np.linspace(0.0, 2.0, 15)), ...]
        weight_her_sampling = False
        self._num_reached_goals_in_bin = None
        self._num_visited_goals_in_bin = None
        self._num_observed_goals_in_bin = None
        self._goal_space_bins = self.config['goal_space_bins']
        if self._goal_space_bins is not None:
            weight_her_sampling = True
            self._num_reached_goals_in_bin = np.zeros(
                tuple(1 + b['box'].size for b in self._goal_space_bins))
            self._num_visited_goals_in_bin = self._num_reached_goals_in_bin.copy(
            )
            self._num_observed_goals_in_bin = self._num_reached_goals_in_bin.copy(
            )

        # her sampler
        self.her_module = HerSampler(
            self.config['replay_strategy'],
            self.config['replay_k'],
            self.env.compute_reward,
            weight_sampling=weight_her_sampling,
            archer_params=self.config['archer_params'])

        # create the normalizer
        self.o_norm = Normalizer(size=obs_size,
                                 default_clip_range=self.config['clip_range'])
        self.g_norm = Normalizer(size=goal_size,
                                 default_clip_range=self.config['clip_range'])

        # create the replay and demo buffers
        self.buffer = ReplayBuffer(self.env_params, self.config['buffer_size'],
                                   self.her_module.sample_her_transitions)
        self.demo_buffer = None
        if self.bc_loss:
            self._init_demo_buffer(update_stats=True)

        self._trained = False

    def _bin_idx_for_goals(self, goals: np.ndarray):
        assert self._goal_space_bins is not None
        return tuple(
            np.digitize(goals[..., b['axis']], b['box'], right=False)
            for b in self._goal_space_bins)

    def _get_info_for_goals(self, goals: np.ndarray):
        assert self._goal_space_bins is not None
        idx = self._bin_idx_for_goals(goals)
        times_success = self._num_reached_goals_in_bin[idx]
        times_visited = self._num_visited_goals_in_bin[idx]
        times_observed = self._num_observed_goals_in_bin[idx]
        tot_success = self._num_reached_goals_in_bin.sum()
        tot_visited = self._num_visited_goals_in_bin.sum()
        tot_observed = self._num_observed_goals_in_bin.sum()
        return (
            times_success,
            tot_success,
            times_visited,
            tot_visited,
            times_observed,
            tot_observed,
        )

    def seed(self, value):
        import random
        np.random.seed(value)
        random.seed(value)
        torch.manual_seed(value)
        self.env.seed(value)

    def _training_step(self):
        rollout_times = []
        update_times = []
        update_results = []
        taken_steps = 0
        failed_steps = 0
        sampling_tot_time = 0.0
        sampling_calls = 0
        step_tic = datetime.now()
        for _ in range(self.config['n_cycles']):
            mb_obs, mb_ag, mb_g, mb_actions = [], [], [], []
            while len(mb_obs) < self.config["rollouts_per_worker"]:
                tic = datetime.now()
                step_failure = False
                # reset the rollouts
                ep_obs, ep_ag, ep_g, ep_actions = [], [], [], []
                # reset the environment
                observation = self.env.reset()
                obs = observation['observation']
                ag = observation['achieved_goal']
                g = observation['desired_goal']

                if self._goal_space_bins is not None:
                    goal_idx = self._bin_idx_for_goals(g)
                    self._num_observed_goals_in_bin[goal_idx] += 1

                # start to collect samples
                for t in range(self.env_params['max_timesteps']):
                    with torch.no_grad():
                        input_tensor = self._preproc_inputs(obs, g)
                        pi = self.actor_network(input_tensor)
                        action = self._select_actions(pi)

                    try:
                        observation_new, _, _, info = self.env.step(action)
                    except MujocoException:
                        step_failure = True
                        break

                    obs_new = observation_new['observation']
                    ag_new = observation_new['achieved_goal']

                    if self._goal_space_bins is not None:
                        goal_idx = self._bin_idx_for_goals(ag_new)
                        self._num_visited_goals_in_bin[goal_idx] += 1
                        if bool(info['is_success']):
                            self._num_reached_goals_in_bin[goal_idx] += 1

                    # append rollouts
                    ep_obs.append(obs.copy())
                    ep_ag.append(ag.copy())
                    ep_g.append(g.copy())
                    ep_actions.append(action.copy())
                    # re-assign the observation
                    obs = obs_new
                    ag = ag_new
                ep_obs.append(obs.copy())
                ep_ag.append(ag.copy())

                if step_failure:
                    failed_steps += 1
                    continue

                taken_steps += self.env_params['max_timesteps']
                mb_obs.append(ep_obs)
                mb_ag.append(ep_ag)
                mb_g.append(ep_g)
                mb_actions.append(ep_actions)
                rollout_times.append((datetime.now() - tic).total_seconds())

            # convert them into arrays
            mb_obs = np.array(mb_obs)
            mb_ag = np.array(mb_ag)
            mb_g = np.array(mb_g)
            mb_actions = np.array(mb_actions)
            # store the episodes
            self.buffer.store_episode([mb_obs, mb_ag, mb_g, mb_actions])
            self._update_normalizer([mb_obs, mb_ag, mb_g, mb_actions])

            tic = datetime.now()
            # train the network
            for _ in range(self.config['n_batches']):
                # sample the episodes
                sampling_tic = datetime.now()
                sampled_transitions = self._sample_batch()
                sampling_tot_time += (datetime.now() -
                                      sampling_tic).total_seconds()
                sampling_calls += 1
                res = self._update_network(sampled_transitions)
                update_results.append(res)
            # soft update
            self._soft_update_target_network(self.actor_target_network,
                                             self.actor_network)
            self._soft_update_target_network(self.critic_target_network,
                                             self.critic_network)
            update_times.append((datetime.now() - tic).total_seconds())
        step_time = (datetime.now() - step_tic).total_seconds()

        tic = datetime.now()
        success_rate, avg_ep_reward = self._eval_agent()
        eval_time = (datetime.now() - tic).total_seconds()

        update_results_dict = dict()
        for k in update_results[0].keys():
            update_results_dict['avg_' + k] = np.mean(
                [r[k] for r in update_results])

        return {
            "test_success_rate": success_rate,
            "test_mean_ep_reward": avg_ep_reward,
            "avg_her_sampling_time": sampling_tot_time / sampling_calls,
            "avg_rollout_time": np.mean(rollout_times),
            "avg_network_update_time": np.mean(update_times),
            "evaluation_time": eval_time,
            "step_time": step_time,
            "env_steps": taken_steps,
            "failed_steps": failed_steps,
            **update_results_dict,
        }

    def _init_demo_buffer(self, update_stats=True):
        assert self.bc_loss
        file_path = self.config['demo_file']
        num_demo = self.config['num_demo']
        self.demo_buffer = ReplayBuffer(self.env_params,
                                        self.config['buffer_size'],
                                        self.her_module.sample_her_transitions)

        # data must be a dictionary of (at least) 4 lists; each list contains partial information for each episode.
        data = pickle.load(open(file_path, 'rb'))
        assert isinstance(data, dict)

        ordered_data = []
        for k in ['mb_obs', 'mb_ag', 'mb_g', 'mb_actions']:
            mb_data = np.asarray(data[k])
            assert len(mb_data) >= num_demo
            ordered_data.append(mb_data[:num_demo])

        self.demo_buffer.store_episode(ordered_data)
        if update_stats:
            self._update_normalizer(ordered_data)

    def _sample_batch(self):
        batch_size = self.config['batch_size']
        sample_kwargs = dict()
        if self._goal_space_bins is not None:
            sample_kwargs['get_info_for_goals'] = self._get_info_for_goals
        if self.bc_loss:
            demo_batch_size = self.config['demo_batch_size']
            transitions = self.buffer.sample(batch_size - demo_batch_size,
                                             **sample_kwargs)
            transitions_demo = self.demo_buffer.sample(demo_batch_size)
            for k, values in transitions_demo.items():
                rollout_vec = transitions[k].tolist()
                for v in values:
                    rollout_vec.append(v.tolist())
                transitions[k] = np.array(rollout_vec)
        else:
            transitions = self.buffer.sample(batch_size, **sample_kwargs)
        return transitions

    def save_checkpoint(self, epoch=0):
        local_dir = self.config.get('local_dir')
        if local_dir is not None:
            local_dir = local_dir + '/checkpoints'
            os.makedirs(local_dir, exist_ok=True)
            model_path = f'{local_dir}/model_{epoch}.pt'
            status_path = f'{local_dir}/status_{epoch}.pkl'
            torch.save([
                self.o_norm.mean, self.o_norm.std, self.g_norm.mean,
                self.g_norm.std,
                self.actor_network.state_dict()
            ], model_path)
            with open(status_path, 'wb') as f:
                pickle.dump(dict(config=self.config), f)

    @staticmethod
    def load(env, local_dir, epoch=None):
        epoch = epoch or '*[0-9]'
        models = glob.glob(f'{local_dir}/model_{epoch}.pt')
        assert len(models) > 0, "No checkpoints found!"

        model_path = sorted(models, key=os.path.getmtime)[-1]
        epoch = model_path.split("_")[-1].split(".")[0]
        status_path = f'{local_dir}/status_{epoch}.pkl'

        with open(status_path, 'rb') as f:
            status = pickle.load(f)
        status['config']['cuda'] = torch.cuda.is_available()
        agent = DdpgHer(env, status['config'])
        agent._trained = True

        o_mean, o_std, g_mean, g_std, actor_state = torch.load(
            model_path, map_location=lambda storage, loc: storage)

        agent.o_norm.mean = o_mean
        agent.o_norm.std = o_std
        agent.g_norm.mean = g_mean
        agent.g_norm.std = g_std

        agent.actor_network.load_state_dict(actor_state)
        agent.actor_network.eval()
        print(f'Loaded model for epoch {epoch}.')
        return agent

    def predict(self, obs):
        if not self._trained:
            raise RuntimeError
        g = obs['desired_goal']
        obs = obs['observation']
        with torch.no_grad():
            inputs = self._preproc_inputs(obs, g)
            pi = self.actor_network(inputs)
            action = pi.cpu().numpy().squeeze()
        return action

    def train(self):
        if self._trained:
            raise RuntimeError

        # make sure that different workers have different seeds
        # (from baselines' original implementation)
        local_uniform = np.random.uniform(size=(1, ))
        root_uniform = local_uniform.copy()
        MPI.COMM_WORLD.Bcast(root_uniform, root=0)
        if MPI.COMM_WORLD.Get_rank() != 0:
            assert local_uniform[0] != root_uniform[0]

        tic = datetime.now()
        n_epochs = self.config.get('n_epochs')
        saved_checkpoints = 0
        total_env_steps = 0

        for iter_i in it.count():
            if n_epochs is not None and iter_i >= n_epochs:
                break
            res = self._training_step()
            total_env_steps += res['env_steps']

            if MPI.COMM_WORLD.Get_rank() == 0:
                if (iter_i + 1) % self.config['checkpoint_freq'] == 0:
                    self.save_checkpoint(epoch=(iter_i + 1))
                    saved_checkpoints += 1
                if callable(self.reporter):
                    self.reporter(
                        **{
                            **res,
                            "training_iteration": iter_i + 1,
                            "total_time": (datetime.now() -
                                           tic).total_seconds(),
                            "checkpoints": saved_checkpoints,
                            "total_env_steps": total_env_steps,
                            "current_buffer_size": self.buffer.current_size,
                        })

    # pre_process the inputs
    def _preproc_inputs(self, obs, g):
        obs_norm = self.o_norm.normalize(obs)
        g_norm = self.g_norm.normalize(g)
        # concatenate the normalized observation and goal
        inputs = np.concatenate([obs_norm, g_norm])
        inputs = torch.tensor(inputs, dtype=torch.float32).unsqueeze(0)
        if self.config['cuda']:
            inputs = inputs.cuda()
        return inputs

    # this function will choose action for the agent and do the exploration
    def _select_actions(self, pi):
        action = pi.cpu().numpy().squeeze()
        # add the gaussian
        action += self.config['noise_eps'] * self.env_params[
            'action_max'] * np.random.randn(*action.shape)
        action = np.clip(action, -self.env_params['action_max'],
                         self.env_params['action_max'])
        # random actions...
        random_actions = np.random.uniform(low=-self.env_params['action_max'],
                                           high=self.env_params['action_max'],
                                           size=self.env_params['action'])
        # choose if use the random actions
        action += np.random.binomial(1, self.config['random_eps'],
                                     1)[0] * (random_actions - action)
        return action

    # update the normalizer
    def _update_normalizer(self, episode_batch):
        mb_obs, mb_ag, mb_g, mb_actions = episode_batch
        mb_obs_next = mb_obs[:, 1:, :]
        mb_ag_next = mb_ag[:, 1:, :]
        # get the number of normalization transitions
        num_transitions = mb_actions.shape[1]
        # create the new buffer to store them
        buffer_temp = {
            'obs': mb_obs,
            'ag': mb_ag,
            'g': mb_g,
            'actions': mb_actions,
            'obs_next': mb_obs_next,
            'ag_next': mb_ag_next,
        }
        transitions = self.her_module.sample_her_transitions(
            buffer_temp, num_transitions)
        obs, g = transitions['obs'], transitions['g']
        # pre process the obs and g
        transitions['obs'], transitions['g'] = self._preproc_og(obs, g)
        # update
        self.o_norm.update(transitions['obs'])
        self.g_norm.update(transitions['g'])
        # recompute the stats
        self.o_norm.recompute_stats()
        self.g_norm.recompute_stats()

    def _preproc_og(self, o, g):
        o = np.clip(o, -self.config['clip_obs'], self.config['clip_obs'])
        g = np.clip(g, -self.config['clip_obs'], self.config['clip_obs'])
        return o, g

    # soft update
    def _soft_update_target_network(self, target, source):
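        # Polyak averaging: target <- polyak * target + (1 - polyak) * source (here polyak is the retention coefficient)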
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_((1 - self.config['polyak']) * param.data +
                                    self.config['polyak'] * target_param.data)

    # update the network
    def _update_network(self, transitions):

        # pre-process the observation and goal
        o, o_next, g = transitions['obs'], transitions[
            'obs_next'], transitions['g']
        transitions['obs'], transitions['g'] = self._preproc_og(o, g)
        transitions['obs_next'], transitions['g_next'] = self._preproc_og(
            o_next, g)

        # start to do the update
        obs_norm = self.o_norm.normalize(transitions['obs'])
        g_norm = self.g_norm.normalize(transitions['g'])
        inputs_norm = np.concatenate([obs_norm, g_norm], axis=1)
        obs_next_norm = self.o_norm.normalize(transitions['obs_next'])
        g_next_norm = self.g_norm.normalize(transitions['g_next'])
        inputs_next_norm = np.concatenate([obs_next_norm, g_next_norm], axis=1)

        # transfer them into the tensor
        inputs_norm_tensor = torch.tensor(inputs_norm, dtype=torch.float32)
        inputs_next_norm_tensor = torch.tensor(inputs_next_norm,
                                               dtype=torch.float32)
        actions_tensor = torch.tensor(transitions['actions'],
                                      dtype=torch.float32)
        r_tensor = torch.tensor(transitions['r'], dtype=torch.float32)

        if self.config['cuda']:
            inputs_norm_tensor = inputs_norm_tensor.cuda()
            inputs_next_norm_tensor = inputs_next_norm_tensor.cuda()
            actions_tensor = actions_tensor.cuda()
            r_tensor = r_tensor.cuda()

        # calculate the target Q value function
        with torch.no_grad():
            # query the target actor and critic on the normalized next-state inputs
            actions_next = self.actor_target_network(inputs_next_norm_tensor)
            q_next_value = self.critic_target_network(inputs_next_norm_tensor,
                                                      actions_next)
            q_next_value = q_next_value.detach()
            target_q_value = r_tensor + self.config['gamma'] * q_next_value
            target_q_value = target_q_value.detach()
            # clip the q value
            clip_return = 1 / (1 - self.config['gamma'])
            target_q_value = torch.clamp(target_q_value, -clip_return, 0)
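            # assuming the usual sparse {-1, 0} HER rewards, the discounted return lies in [-1/(1-gamma), 0]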

        # the q loss
        real_q_value = self.critic_network(inputs_norm_tensor, actions_tensor)
        critic_loss = (target_q_value - real_q_value).pow(2).mean()

        # self.main.Q_tf ==> real_q_value
        # self.main.Q_pi_tf ==> self.critic_network(inputs_norm_tensor, actions_real) ==> approx_q_value

        # the actor loss
        action_l2 = self.config['action_l2']
        actions_real = self.actor_network(inputs_norm_tensor)
        approx_q_value = self.critic_network(inputs_norm_tensor, actions_real)

        if self.bc_loss:
            # train with demonstrations using behavior cloning

            # choose only the demo buffer samples
            b_size = self.config['batch_size']
            demo_b_size = self.config['demo_batch_size']
            mask = np.concatenate(
                (np.zeros(b_size - demo_b_size), np.ones(demo_b_size)), axis=0)
            mask = torch.tensor(mask,
                                dtype=torch.uint8,
                                device=actions_real.device)

            if self.q_filter:
                # use Q-filter trick to perform BC only when needed
                with torch.no_grad():
                    mask &= (real_q_value > approx_q_value).squeeze()

            prm_loss_weight = self.config['prm_loss_weight']
            cloning_loss = self.config['aux_loss_weight'] * (
                actions_real[mask] - actions_tensor[mask]).pow(2).sum()
        else:
            # train without demonstrations
            prm_loss_weight = 1.0
            cloning_loss = None

        actor_loss = -prm_loss_weight * approx_q_value.mean()
        actor_loss += prm_loss_weight * action_l2 * (
            actions_real / self.env_params['action_max']).pow(2).mean()

        if cloning_loss is not None:
            actor_loss += cloning_loss

        # update actor network
        self.actor_optim.zero_grad()
        actor_loss.backward()
        sync_grads(self.actor_network)
        self.actor_optim.step()

        # update critic network
        self.critic_optim.zero_grad()
        critic_loss.backward()
        sync_grads(self.critic_network)
        self.critic_optim.step()

        res = dict(actor_loss=actor_loss.item(),
                   critic_loss=critic_loss.item())
        if cloning_loss is not None:
            res['cloning_loss'] = cloning_loss.item()
        return res

    # do the evaluation
    def _eval_agent(self):
        total_success_rate = []
        ep_rewards = []
        for _ in range(self.config['n_test_rollouts']):
            per_success_rate = []
            ep_reward = 0.0
            observation = self.env.reset()
            obs = observation['observation']
            g = observation['desired_goal']
            for _ in range(self.env_params['max_timesteps']):
                with torch.no_grad():
                    input_tensor = self._preproc_inputs(obs, g)
                    pi = self.actor_network(input_tensor)
                    # convert the actions
                    actions = pi.detach().cpu().numpy().squeeze()
                observation_new, rew, _, info = self.env.step(actions)
                obs = observation_new['observation']
                g = observation_new['desired_goal']
                per_success_rate.append(info['is_success'])
                ep_reward += rew
            ep_rewards.append(ep_reward)
            total_success_rate.append(per_success_rate)
        total_success_rate = np.array(total_success_rate)
        local_success_rate = np.mean(total_success_rate[:, -1])
        global_success_rate = MPI.COMM_WORLD.allreduce(local_success_rate,
                                                       op=MPI.SUM)
        global_success_rate /= MPI.COMM_WORLD.Get_size()

        avg_ep_reward = np.array(ep_rewards).mean()
        global_avg_ep_reward = MPI.COMM_WORLD.allreduce(avg_ep_reward,
                                                        op=MPI.SUM)
        global_avg_ep_reward /= MPI.COMM_WORLD.Get_size()

        return global_success_rate, global_avg_ep_reward
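
A hedged usage sketch for the DdpgHer example above; it assumes a goal-based Gym environment (the id is illustrative) and is normally launched under mpirun so that each MPI rank runs one worker:

# Hypothetical usage of the DdpgHer example above.
env = gym.make("FetchReach-v1")                     # illustrative goal-based environment
agent = DdpgHer(env, config={"n_epochs": 10},
                reporter=lambda **kw: print(kw))    # reporter receives the per-epoch metrics as keyword arguments
agent.train()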
Example #9
class PPOAgent:

    GAMMA = 0.99

    GAE_LAMBDA = 0.95

    CLIPRANGE = 0.2

    OPT_ITER = 20

    BATCH_SIZE = 2048

    def __init__(self,
                 env_id,
                 action_space,
                 trajectory_size=256,
                 n_envs=1,
                 max_timesteps=1500):

        self.env_id = env_id

        self.n_envs = n_envs

        self.trajectory_size = trajectory_size

        self.vecenv = VecEnv(env_id=self.env_id,
                             n_envs=self.n_envs,
                             max_timesteps=max_timesteps)

        self.policy = PolicyNetwork(action_space=action_space)

        self.old_policy = PolicyNetwork(action_space=action_space)

        self.critic = CriticNetwork()

        self.r_running_stats = util.RunningStats(shape=(action_space, ))

        self._init_network()

    def _init_network(self):

        env = gym.make(self.env_id)

        state = np.atleast_2d(env.reset())

        self.policy(state)

        self.old_policy(state)

    def run(self, n_updates, logdir):

        self.summary_writer = tf.summary.create_file_writer(str(logdir))

        history = {"steps": [], "scores": []}

        states = self.vecenv.reset()

        hiscore = None

        for epoch in range(n_updates):

            for _ in range(self.trajectory_size):

                actions = self.policy.sample_action(states)

                next_states = self.vecenv.step(actions)

                states = next_states

            trajectories = self.vecenv.get_trajectories()

            for trajectory in trajectories:
                self.r_running_stats.update(trajectory["r"])

            trajectories = self.compute_advantage(trajectories)

            states, actions, advantages, vtargs = self.create_minibatch(
                trajectories)

            vloss = self.update_critic(states, vtargs)

            self.update_policy(states, actions, advantages)

            global_steps = (epoch + 1) * self.trajectory_size * self.n_envs
            train_scores = np.array([traj["r"].sum() for traj in trajectories])

            if epoch % 1 == 0:
                test_scores, total_steps = self.play(n=1)
                test_scores, total_steps = np.array(test_scores), np.array(
                    total_steps)
                history["steps"].append(global_steps)
                history["scores"].append(test_scores.mean())
                ma_score = sum(history["scores"][-10:]) / 10
                with self.summary_writer.as_default():
                    tf.summary.scalar("test_score",
                                      test_scores.mean(),
                                      step=epoch)
                    tf.summary.scalar("test_steps",
                                      total_steps.mean(),
                                      step=epoch)
                print(
                    f"Epoch {epoch}, {global_steps//1000}K, {test_scores.mean()}"
                )

            if epoch // 10 > 10 and (hiscore is None or ma_score > hiscore):
                self.save_model()
                hiscore = ma_score
                print("Model Saved")

            with self.summary_writer.as_default():
                tf.summary.scalar("value_loss", vloss, step=epoch)
                tf.summary.scalar("train_score",
                                  train_scores.mean(),
                                  step=epoch)

        return history

    def compute_advantage(self, trajectories):
        """
            Generalized Advantage Estimation (GAE, 2016)
        """

        for trajectory in trajectories:

            trajectory["v_pred"] = self.critic(trajectory["s"]).numpy()

            trajectory["v_pred_next"] = self.critic(trajectory["s2"]).numpy()

            is_nonterminals = 1 - trajectory["done"]

            normed_rewards = (trajectory["r"] /
                              (np.sqrt(self.r_running_stats.var) + 1e-4))

            deltas = normed_rewards + self.GAMMA * is_nonterminals * trajectory[
                "v_pred_next"] - trajectory["v_pred"]

            advantages = np.zeros_like(deltas, dtype=np.float32)

            lastgae = 0
            for i in reversed(range(len(deltas))):
                lastgae = deltas[
                    i] + self.GAMMA * self.GAE_LAMBDA * is_nonterminals[
                        i] * lastgae
                advantages[i] = lastgae

            trajectory["advantage"] = advantages

            trajectory["R"] = advantages + trajectory["v_pred"]

        return trajectories

    def update_policy(self, states, actions, advantages):

        self.old_policy.set_weights(self.policy.get_weights())

        indices = np.random.choice(range(states.shape[0]),
                                   (self.OPT_ITER, self.BATCH_SIZE))

        for i in range(self.OPT_ITER):

            idx = indices[i]

            old_means, old_stdevs = self.old_policy(states[idx])

            old_logprob = self.compute_logprob(old_means, old_stdevs,
                                               actions[idx])

            with tf.GradientTape() as tape:

                new_means, new_stdevs = self.policy(states[idx])

                new_logprob = self.compute_logprob(new_means, new_stdevs,
                                                   actions[idx])

                ratio = tf.exp(new_logprob - old_logprob)
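                # importance ratio pi_new(a|s) / pi_old(a|s); PPO clips it to [1 - CLIPRANGE, 1 + CLIPRANGE]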

                ratio_clipped = tf.clip_by_value(ratio, 1 - self.CLIPRANGE,
                                                 1 + self.CLIPRANGE)

                loss_unclipped = ratio * advantages[idx]

                loss_clipped = ratio_clipped * advantages[idx]

                loss = tf.minimum(loss_unclipped, loss_clipped)

                loss = -1 * tf.reduce_mean(loss)

            grads = tape.gradient(loss, self.policy.trainable_variables)
            grads, _ = tf.clip_by_global_norm(grads, 0.5)
            self.policy.optimizer.apply_gradients(
                zip(grads, self.policy.trainable_variables))

    def update_critic(self, states, v_targs):

        losses = []

        indices = np.random.choice(range(states.shape[0]),
                                   (self.OPT_ITER, self.BATCH_SIZE))

        for i in range(self.OPT_ITER):

            idx = indices[i]

            old_vpred = self.critic(states[idx])

            with tf.GradientTape() as tape:

                vpred = self.critic(states[idx])

                vpred_clipped = old_vpred + tf.clip_by_value(
                    vpred - old_vpred, -self.CLIPRANGE, self.CLIPRANGE)
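                # PPO2-style clipped value loss: penalize the larger of the clipped and unclipped squared errors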

                loss = tf.maximum(tf.square(v_targs[idx] - vpred),
                                  tf.square(v_targs[idx] - vpred_clipped))

                loss = tf.reduce_mean(loss)

            grads = tape.gradient(loss, self.critic.trainable_variables)
            grads, _ = tf.clip_by_global_norm(grads, 0.5)
            self.critic.optimizer.apply_gradients(
                zip(grads, self.critic.trainable_variables))

            losses.append(loss)

        return np.array(losses).mean()

    @tf.function
    def compute_logprob(self, means, stdevs, actions):
        """ガウス分布の確率密度関数よりlogp(x)を計算
            logp(x) = -0.5 log(2π) - log(std)  -0.5 * ((x - mean) / std )^2
        """
        logprob = -0.5 * np.log(2 * np.pi)
        logprob += -tf.math.log(stdevs)
        logprob += -0.5 * tf.square((actions - means) / stdevs)
        logprob = tf.reduce_sum(logprob, axis=1, keepdims=True)
        return logprob

    def create_minibatch(self, trajectories):

        states = np.vstack([traj["s"] for traj in trajectories])
        actions = np.vstack([traj["a"] for traj in trajectories])

        advantages = np.vstack([traj["advantage"] for traj in trajectories])

        v_targs = np.vstack([traj["R"] for traj in trajectories])

        return states, actions, advantages, v_targs

    def save_model(self):

        self.policy.save_weights("checkpoints/policy")

        self.critic.save_weights("checkpoints/critic")

    def load_model(self):

        self.policy.load_weights("checkpoints/policy")

        self.critic.load_weights("checkpoints/critic")

    def play(self, n=1, monitordir=None, verbose=False):

        if monitordir:
            env = wrappers.Monitor(gym.make(self.env_id),
                                   monitordir,
                                   force=True,
                                   video_callable=(lambda ep: True))
        else:
            env = gym.make(self.env_id)

        total_rewards = []
        total_steps = []

        for _ in range(n):

            state = env.reset()

            done = False

            total_reward = 0

            steps = 0

            while not done:

                steps += 1

                action = self.policy.sample_action(state)

                next_state, reward, done, _ = env.step(action[0])

                if verbose:
                    mean, sd = self.policy(np.atleast_2d(state))
                    print(mean, sd)
                    print(reward)

                total_reward += reward

                if done:
                    break
                else:
                    state = next_state

            total_rewards.append(total_reward)
            total_steps.append(steps)
            print()
            print(total_reward, steps)
            print()

        return total_rewards, total_steps
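
A usage sketch for the PPOAgent example above; the environment id, action dimension, and update count are illustrative:

# Hypothetical usage of the PPOAgent example above.
agent = PPOAgent(env_id="Pendulum-v0", action_space=1, trajectory_size=256, n_envs=1)
history = agent.run(n_updates=500, logdir="logs/ppo")   # returns {"steps": [...], "scores": [...]}
agent.play(n=1, monitordir=None, verbose=False)         # roll out the trained policy once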
Example #10
class DDPGAgent:

    MAX_EXPERIENCES = 30000

    MIN_EXPERIENCES = 300

    ENV_ID = "Pendulum-v0"

    ACTION_SPACE = 1

    OBSERVATION_SPACE = 3

    UPDATE_PERIOD = 4

    START_EPISODES = 20

    TAU = 0.02

    GAMMA = 0.99

    BATCH_SIZE = 32

    def __init__(self):

        self.env = gym.make(self.ENV_ID)

        self.env.max_episode_steps = 1000

        self.actor_network = ActorNetwork(action_space=self.ACTION_SPACE)

        self.target_actor_network = ActorNetwork(
            action_space=self.ACTION_SPACE)

        self.critic_network = CriticNetwork()

        self.target_critic_network = CriticNetwork()

        self.stdev = 0.2

        self.buffer = ReplayBuffer(max_experiences=self.MAX_EXPERIENCES)

        self.global_steps = 0

        self.hiscore = None

        self._build_networks()

    def _build_networks(self):
        """パラメータの初期化
        """

        dummy_state = np.random.normal(0, 0.1, size=self.OBSERVATION_SPACE)
        dummy_state = (dummy_state[np.newaxis, ...]).astype(np.float32)

        dummy_action = np.random.normal(0, 0.1, size=self.ACTION_SPACE)
        dummy_action = (dummy_action[np.newaxis, ...]).astype(np.float32)

        self.actor_network.call(dummy_state)
        self.target_actor_network.call(dummy_state)
        self.target_actor_network.set_weights(self.actor_network.get_weights())

        self.critic_network.call(dummy_state, dummy_action, training=False)
        self.target_critic_network.call(dummy_state,
                                        dummy_action,
                                        training=False)
        self.target_critic_network.set_weights(
            self.critic_network.get_weights())

    def play(self, n_episodes):

        total_rewards = []

        recent_scores = collections.deque(maxlen=10)

        for n in range(n_episodes):

            if n <= self.START_EPISODES:
                total_reward, localsteps = self.play_episode(random=True)
            else:
                total_reward, localsteps = self.play_episode()

            total_rewards.append(total_reward)

            recent_scores.append(total_reward)

            recent_average_score = sum(recent_scores) / len(recent_scores)

            print(f"Episode {n}: {total_reward}")
            print(f"Local steps {localsteps}")
            print(f"Experiences {len(self.buffer)}")
            print(f"Global step {self.global_steps}")
            print(f"Noise stdev {self.stdev}")
            print(f"recent average score {recent_average_score}")
            print()

            if (self.hiscore is None) or (recent_average_score > self.hiscore):
                self.hiscore = recent_average_score
                print(f"HISCORE Updated: {self.hiscore}")
                self.save_model()

        return total_rewards

    def play_episode(self, random=False):

        total_reward = 0

        steps = 0

        done = False

        state = self.env.reset()

        while not done:

            if random:
                action = np.random.uniform(-2, 2, size=self.ACTION_SPACE)
            else:
                action = self.actor_network.sample_action(state,
                                                          noise=self.stdev)

            next_state, reward, done, _ = self.env.step(action)

            exp = Experience(state, action, reward, next_state, done)

            self.buffer.add_experience(exp)

            state = next_state

            total_reward += reward

            steps += 1

            self.global_steps += 1

            if self.global_steps % self.UPDATE_PERIOD == 0:
                self.update_network(self.BATCH_SIZE)
                self.update_target_network()

        return total_reward, steps

    def update_network(self, batch_size):

        if len(self.buffer) < self.MIN_EXPERIENCES:
            return

        (states, actions, rewards, next_states,
         dones) = self.buffer.get_minibatch(batch_size)

        next_actions = self.target_actor_network(next_states)

        next_qvalues = self.target_critic_network(
            next_states, next_actions).numpy().flatten()

        #: Compute target values and update CriticNetwork
        target_values = np.vstack([
            reward + self.GAMMA * next_qvalue if not done else reward
            for reward, done, next_qvalue in zip(rewards, dones, next_qvalues)
        ]).astype(np.float32)

        with tf.GradientTape() as tape:
            qvalues = self.critic_network(states, actions)
            loss = tf.reduce_mean(tf.square(target_values - qvalues))

        variables = self.critic_network.trainable_variables
        gradients = tape.gradient(loss, variables)
        self.critic_network.optimizer.apply_gradients(zip(
            gradients, variables))

        #: Update ActorNetwork
        with tf.GradientTape() as tape:
            J = -1 * tf.reduce_mean(
                self.critic_network(states, self.actor_network(states)))

        variables = self.actor_network.trainable_variables
        gradients = tape.gradient(J, variables)
        self.actor_network.optimizer.apply_gradients(zip(gradients, variables))

    def update_target_network(self):

        # soft-target update Actor
        target_actor_weights = self.target_actor_network.get_weights()
        actor_weights = self.actor_network.get_weights()

        assert len(target_actor_weights) == len(actor_weights)

        #: element-wise Polyak averaging (avoids building a ragged np.array)
        self.target_actor_network.set_weights([
            (1 - self.TAU) * target_w + self.TAU * w
            for target_w, w in zip(target_actor_weights, actor_weights)
        ])

        # soft-target update Critic
        target_critic_weights = self.target_critic_network.get_weights()
        critic_weights = self.critic_network.get_weights()

        assert len(target_critic_weights) == len(critic_weights)

        self.target_critic_network.set_weights([
            (1 - self.TAU) * target_w + self.TAU * w
            for target_w, w in zip(target_critic_weights, critic_weights)
        ])

    def save_model(self):

        self.actor_network.save_weights("checkpoints/actor")

        self.critic_network.save_weights("checkpoints/critic")

    def load_model(self):

        self.actor_network.load_weights("checkpoints/actor")

        self.target_actor_network.load_weights("checkpoints/actor")

        self.critic_network.load_weights("checkpoints/critic")

        self.target_critic_network.load_weights("checkpoints/critic")

    def test_play(self, n, monitordir, load_model=False):

        if load_model:
            self.load_model()

        if monitordir:
            env = wrappers.Monitor(gym.make(self.ENV_ID),
                                   monitordir,
                                   force=True,
                                   video_callable=(lambda ep: ep % 1 == 0))
        else:
            env = gym.make(self.ENV_ID)

        for i in range(n):

            total_reward = 0

            steps = 0

            done = False

            state = env.reset()

            while not done:

                action = self.actor_network.sample_action(state, noise=False)

                next_state, reward, done, _ = env.step(action)

                state = next_state

                total_reward += reward

                steps += 1

            print()
            print(f"Test Play {i}: {total_reward}")
            print(f"Steps:", steps)
            print()
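
A minimal usage sketch for the agent above; the episode count and the monitor directory are illustrative assumptions, while play, save_model and test_play are the methods defined in the class:

if __name__ == "__main__":
    agent = DDPGAgent()

    #: warm-up episodes take random actions (START_EPISODES), then DDPG updates
    rewards = agent.play(n_episodes=200)

    #: reload the best checkpoint written by save_model() and record a few runs
    agent.test_play(n=3, monitordir="videos/ddpg", load_model=True)  # hypothetical path
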
Example #11
0
class DDPGAgent:
    """
    A DDPG Agent
    """
    def __init__(self,
                 state_dim,
                 action_dim,
                 lr_actor=1e-4,
                 lr_critic=1e-4,
                 lr_decay=.95,
                 replay_buff_size=10000,
                 gamma=.99,
                 batch_size=128,
                 random_seed=42,
                 soft_update_tau=1e-3,
                 actor_layer_dim_1=128,
                 actor_layer_dim_2=128,
                 actor_layer_dim_3=0,
                 critic_layer_dim_1=128,
                 critic_layer_dim_2=64,
                 critic_layer_dim_3=0):
        """
        Initialize model
        """
        self.lr_actor = lr_actor
        self.gamma = gamma
        self.lr_critic = lr_critic
        self.lr_decay = lr_decay
        self.tau = soft_update_tau

        self.actor_local = ActorNetwork(state_dim, action_dim,
                                        actor_layer_dim_1, actor_layer_dim_2,
                                        actor_layer_dim_3).to(device=device)
        self.actor_target = ActorNetwork(state_dim, action_dim,
                                         actor_layer_dim_1, actor_layer_dim_2,
                                         actor_layer_dim_3).to(device=device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.lr_actor)

        self.critic_local = CriticNetwork(state_dim, action_dim,
                                          critic_layer_dim_1,
                                          critic_layer_dim_2,
                                          critic_layer_dim_3).to(device=device)
        self.critic_target = CriticNetwork(
            state_dim, action_dim, critic_layer_dim_1, critic_layer_dim_2,
            critic_layer_dim_3).to(device=device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.lr_critic)

        self.noise = OUNoise(action_dim, random_seed)

        self.memory = ReplayBuffer(action_dim, replay_buff_size, batch_size,
                                   random_seed)
        self.path = ""

    def update_model(self, state, action, reward, next_state, done):
        """
        Store a single transition and, once the replay buffer is ready,
        update policy and value parameters using a sampled batch of experiences.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        :state, action, reward, next_state, done: one environment transition
        """
        self.memory.add(state, action, reward, next_state, done)
        if not self.memory.is_ready():
            return

        experiences = self.memory.sample()
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.gamma * Q_targets_next *
                               (1 - dones)).detach()
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.smooth_l1_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def act(self, state, noise_t=0.0):
        """
        Returns actions for given state as per current policy.
        """
        if len(np.shape(state)) == 1:
            state = state.reshape(1, -1)
        state = torch.from_numpy(state).float().to(device=device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        action += self.noise.sample() * noise_t
        return np.clip(action, -1, 1).squeeze()

    def reset(self):
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """
        Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        :local_model: PyTorch model (weights will be copied from)
        :target_model: PyTorch model (weights will be copied to)
        :tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
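
A rough training-loop sketch for this PyTorch DDPGAgent; the environment id, episode/step counts and the action rescaling are illustrative assumptions, while act, update_model and reset are the methods defined above:

import gym
import numpy as np

env = gym.make("Pendulum-v0")                 # assumed continuous-control env, actions in [-2, 2]
agent = DDPGAgent(state_dim=env.observation_space.shape[0],
                  action_dim=env.action_space.shape[0])

for episode in range(200):
    state = env.reset()
    agent.reset()                             # reset the OU noise process
    episode_reward = 0.0
    for t in range(200):
        action = agent.act(state, noise_t=1.0)
        # the actor outputs actions in [-1, 1]; rescale to the env's range (assumption)
        next_state, reward, done, _ = env.step(2.0 * np.atleast_1d(action))
        agent.update_model(state, action, reward, next_state, done)
        state = next_state
        episode_reward += reward
        if done:
            break
    print(episode, episode_reward)
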
Example #12
0
class Agent():
    """DDPG Agent"""
    def __init__(self, state_size, action_size, hd1_units=400, hd2_units=300,
                 random_seed=0, buffer_size=int(2e5), batch_size=256,
                 tau=0.0005, actorLr=1e-3, criticLr=1e-3, weight_decay=0,
                 update_every=20, gamma=0.99):
        """ :state_size (int): dimension of each state
            :action_size (int): dimension of each action
            :hd1_units (int) : number of units in the first hidden layer
            :hd2_units (int) : number of units in the second hidden layer
            :random_seed (int): random seed
            :buffer_size (int): replay buffer size
            :batch_size (int): batch size
            :tau (float): interpolation factor
            :actorLr (float): actor learning rate
            :criticLr (float): critic learning rate
            :weight_decay (float): Optimizer L2 penalty
            :update_every (int): learning frequency
            :gamma (float): Discount factor
        """
        self.state_size = state_size
        self.action_size = action_size
        self.update_every = update_every
        self.gamma = gamma
        self.tau = tau
        random.seed(random_seed)

        # Actor & Target Networks
        self.actor_local = ActorNetwork(state_size, action_size, random_seed, hd1_units, hd2_units).to(device)
        self.actor_target = ActorNetwork(state_size, action_size, random_seed, hd1_units, hd2_units).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=actorLr, weight_decay = weight_decay)

        # Critic & Target Networks
        self.critic_local = CriticNetwork(state_size, action_size, random_seed, 400, 300).to(device)
        self.critic_target = CriticNetwork(state_size, action_size, random_seed, 400, 300).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=criticLr, weight_decay=weight_decay)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, random_seed)

        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # store transition
        self.memory.add(state, action, reward, next_state, done)
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # Learn, if enough samples are available in memory
            if len(self.memory) > 10000:
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def act(self, state, eps, add_noise=True):
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        # manual action clipping
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """
            :experiences (Tuple): Transition parameters (s, a, r, s', done)
            :gamma (float): Discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Update critic
        # Get the predicted next-state actions and Q values from target nets
        with torch.no_grad():
            actions_next = self.actor_target(next_states)
            Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 5)
        self.critic_optimizer.step()

        # Update actor
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 5)
        self.actor_optimizer.step()

        # update targets
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """
            local_model: Source
            target_model: Destination
            tau (float):  Interpolation factor
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
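
A sketch of how this Agent could be driven by a standard interaction loop; the environment and episode count are placeholders. Note that act() accepts an eps argument but does not use it internally, and step() only starts learning once more than 10000 transitions are stored:

import gym

env = gym.make("Pendulum-v0")                 # assumed environment
agent = Agent(state_size=env.observation_space.shape[0],
              action_size=env.action_space.shape[0])

for episode in range(100):
    state = env.reset()
    agent.reset()
    score = 0.0
    done = False
    while not done:
        action = agent.act(state, eps=0.0)    # eps is currently unused by act()
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state
        score += reward
    print(episode, score)
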
Example #13
0
class DDPGAgent():
    def __init__(self, state_space, action_space, buffer_size, batch_size,
                 learning_rate_actor, learning_rate_critic, update_rate,
                 gamma, tau, device, seed, num_agents, epsilon,
                 epsilon_decay, epsilon_min):
        self.num_agents = num_agents
        self.action_space = action_space
        self.state_space = state_space
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.step_count = 0
        self.update_rate = update_rate
        self.tau = tau
        self.seed = seed
        self.device = device
        self.gamma = gamma
        self.actor_local_network = ActorNetwork(state_space, action_space, device, seed).to(device)
        self.actor_target_network = ActorNetwork(state_space, action_space, device, seed).to(device)
        self.critic_local_network = CriticNetwork(state_space, action_space, device, seed).to(device)
        self.critic_target_network = CriticNetwork(state_space, action_space, device, seed).to(device)
        
        
        self.actor_optimizer = torch.optim.Adam(self.actor_local_network.parameters(), lr=learning_rate_actor)
        self.critic_optimizer = torch.optim.Adam(self.critic_local_network.parameters(), lr=learning_rate_critic)
 
        self.noise = OUNoise(action_space, seed)
        self.memory = ReplayBuffer(buffer_size = self.buffer_size, batch_size=self.batch_size, 
                                   device=device, seed=seed)
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        
    def reset(self):
        self.noise.reset()
        
    def act(self, state, epsilon, add_noise=True):
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local_network.eval()
        with torch.no_grad():
            action = self.actor_local_network(state).cpu().data.numpy()
        self.actor_local_network.train()
        if add_noise:
            # exploration noise is scaled by the agent's decaying epsilon
            action += self.noise.sample() * self.epsilon
        return np.clip(action, -1, 1)
    
        
    def step(self, states, actions, rewards, next_states, dones):
        # store one transition per agent, then learn every update_rate steps
        for state, action, reward, next_state, done in zip(states, actions,
                                                           rewards, next_states,
                                                           dones):
            self.memory.add(state, action, reward, next_state, done)
        self.step_count = (self.step_count + 1) % self.update_rate
        if self.step_count == 0 and len(self.memory) > self.batch_size:
            self.learn(self.gamma)
            
            
    def learn(self, gamma):
        # interaction between actor & critic network
        states, actions, rewards, next_states, dones = self.memory.sample()
        
        next_actions = self.actor_target_network(next_states)
        q_target_next = self.critic_target_network(next_states, next_actions)
        # targets are treated as constants for the critic update
        q_target = (rewards + gamma * q_target_next * (1 - dones)).detach()
        q_expected = self.critic_local_network(states, actions)
        critic_loss = F.mse_loss(q_expected, q_target)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()
        self.soft_update(self.critic_target_network, self.critic_local_network)
        
        actor_preds = self.actor_local_network(states)
        actor_loss = -self.critic_local_network(states, actor_preds).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        self.soft_update(self.actor_target_network , self.actor_local_network)
        
        self.epsilon -= self.epsilon_decay
        self.epsilon = max(self.epsilon_min, self.epsilon)
        self.noise.reset()
       
        
    def soft_update(self, target, local):
        for target_params, local_params in zip(target.parameters(), local.parameters()):
            target_params.data.copy_(self.tau*local_params.data + (1.0-self.tau)*target_params.data)
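
Finally, a construction sketch for this multi-agent DDPGAgent; the state/action sizes, agent count and hyperparameters below are placeholders for whatever vectorised environment is actually used, and only illustrate the shapes expected by act() and step():

import torch
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# all hyperparameter values here are illustrative assumptions
agent = DDPGAgent(state_space=33, action_space=4, buffer_size=int(1e5),
                  batch_size=128, learning_rate_actor=1e-4,
                  learning_rate_critic=1e-3, update_rate=20, gamma=0.99,
                  tau=1e-3, device=device, seed=0, num_agents=20,
                  epsilon=1.0, epsilon_decay=1e-6, epsilon_min=0.05)

# one interaction step for an environment running num_agents copies in parallel
states = np.zeros((20, 33), dtype=np.float32)           # placeholder observations
actions = agent.act(states, epsilon=agent.epsilon)      # shape (20, 4), clipped to [-1, 1]
rewards = np.zeros(20)                                  # placeholder environment feedback
next_states = states.copy()
dones = np.zeros(20, dtype=bool)
agent.step(states, actions, rewards, next_states, dones)
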