Example #1
class Agent(object):
    def __init__(self, sess, hps, rm):
        self.sess = sess
        self.hps = hps
        self.rm = rm
        self.ou = OrnsteinUhlenbeck(hps['a_dim'])
        self.gamma = hps['gamma']
        self.tau = hps['tau']
        self.a_bound = hps['a_bound']
        self.noise_decay = hps['noise_decay']

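        # The online actor/critic are trainable; the target networks are frozen
        # copies that track them through the soft (Polyak) updates built below.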
        self.actor = Actor(self.sess, self.hps, 'actor', trainable=True)
        self.actor_target = Actor(self.sess,
                                  self.hps,
                                  'actor_target',
                                  trainable=False)
        self.critic = Critic(self.sess, self.hps, 'critic', trainable=True)
        self.critic_target = Critic(self.sess,
                                    self.hps,
                                    'critic_target',
                                    trainable=False)

        self.critic.build_train_op(self.actor, 'critic')
        self.actor.build_train_op(self.critic, 'actor')

        self.actor_soft_update_op = build_soft_update_op(
            self.sess, 'actor_target', 'actor', self.tau)
        self.critic_soft_update_op = build_soft_update_op(
            self.sess, 'critic_target', 'critic', self.tau)

    def explore(self, state, i):
        action = self.actor.act(state)
        # action += ( self.ou.sample() * self.a_bound * self.noise_decay ** i )
        action += (self.ou.sample() * self.noise_decay**i)
        return action

    def exploit(self, state):
        action = self.actor.act(state)
        return action

    def learn(self):
        s1, a1, r1, s2 = self.rm.sample()

        # Optimize critic
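        # Bellman target from the target networks: y1 = r1 + gamma * Q'(s2, mu'(s2))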
        a2 = self.actor_target.act(s2)
        q2 = self.critic_target.predict(s2, a2)
        y1 = r1 + self.gamma * q2
        loss, _ = self.critic.backward(s1, a1, y1)

        # Optimize actor
        loss, _ = self.actor.backward(s1)

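        # Polyak-average the online weights into the target networks.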
        self.sess.run(self.actor_soft_update_op)
        self.sess.run(self.critic_soft_update_op)
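
Examples #1 and #2 rely on a build_soft_update_op helper that is not shown here. A minimal sketch of what such a helper could look like, assuming TensorFlow 1.x and that each network's variables live in a variable scope named after the network (the sess argument is only kept to match the call sites above):

import tensorflow as tf

def build_soft_update_op(sess, target_scope, source_scope, tau):
    # Sketch: collect the variables of each scope, pair them by creation order,
    # then blend: theta_target <- tau * theta_source + (1 - tau) * theta_target.
    target_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=target_scope)
    source_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=source_scope)
    return tf.group(*[t.assign(tau * s + (1.0 - tau) * t)
                      for t, s in zip(target_vars, source_vars)])
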
Example #2
class DDPGAgent(BaseAgent):
    def __init__(self, sess, hps, rm):

        # TODO: I probably also still need to pass in the parameters
        # for the BaseAgent here.
        super(DDPGAgent, self).__init__()

        self.sess = sess
        self.hps = hps
        self.rm = rm
        self.ou = OrnsteinUhlenbeck(hps['a_dim'])
        self.gamma = hps['gamma']
        self.tau = hps['tau']
        self.a_bound = hps['a_bound']
        self.noise_decay = hps['noise_decay']

        self.actor = Actor(self.sess, self.hps, 'actor', trainable=True)
        self.actor_target = Actor(self.sess,
                                  self.hps,
                                  'actor_target',
                                  trainable=False)
        self.critic = Critic(self.sess, self.hps, 'critic', trainable=True)
        self.critic_target = Critic(self.sess,
                                    self.hps,
                                    'critic_target',
                                    trainable=False)

        self.critic.build_train_op(self.actor, 'critic')
        self.actor.build_train_op(self.critic, 'actor')

        self.actor_soft_update_op = build_soft_update_op(
            self.sess, 'actor_target', 'actor', self.tau)
        self.critic_soft_update_op = build_soft_update_op(
            self.sess, 'critic_target', 'critic', self.tau)

    def explore(self, state, i):
        action = self.actor.act(state)
        # action += ( self.ou.sample() * self.a_bound * self.noise_decay ** i )
        action += (self.ou.sample() * self.noise_decay**i)
        return action

    def exploit(self, state):
        action = self.actor.act(state)
        return action

    def think(self, state, i):
        if self.hps['mode'] == 'training':
            return self.explore(state, i)
        else:
            return self.exploit(state)

    def learn(self):
        s1, a1, r1, s2 = self.rm.sample()

        # Optimize critic
        a2 = self.actor_target.act(s2)
        q2 = self.critic_target.predict(s2, a2)
        y1 = r1 + self.gamma * q2
        loss, _ = self.critic.backward(s1, a1, y1)

        # Optimize actor
        loss, _ = self.actor.backward(s1)

        self.sess.run(self.actor_soft_update_op)
        self.sess.run(self.critic_soft_update_op)
Example #3
def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ]

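    # Run the environments in subprocesses when more than one worker is requested.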
    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

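    # Stack the last num_stack observations along the channel dimension.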
    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if len(envs.observation_space.shape) == 3:
        actor_critic = Actor(obs_shape[0], envs.action_space,
                             args.recurrent_policy, envs.action_space.n)
        target_actor = Actor(obs_shape[0], envs.action_space,
                             args.recurrent_policy, envs.action_space.n)
        critic = Critic(in_channels=4, num_actions=envs.action_space.n)
        critic_target = Critic(in_channels=4, num_actions=envs.action_space.n)
    else:
        assert not args.recurrent_policy, \
            "Recurrent policy is not implemented for the MLP controller"
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    if args.cuda:
        actor_critic.cuda()
        critic.cuda()
        critic_target.cuda()
        target_actor.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(),
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)
        critic_optim = optim.Adam(critic.parameters(), lr=1e-4)
        gamma = 0.99
        tau = 0.001

    #memory = SequentialMemory(limit=args.rmsize, window_length=args.window_length)
    mem_buffer = ReplayBuffer()

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size,
                              envs.action_space.n)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

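    # Shift the frame stack and write the newest observation into its last slot.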
    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            value = critic.forward(
                Variable(rollouts.observations[step], volatile=True),
                action_log_prob)
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            pre_state = rollouts.observations[step].cpu().numpy()
            update_current_obs(obs)
            mem_buffer.add((pre_state, current_obs,
                            action_log_prob.data.cpu().numpy(), reward, done))
            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, value.data, reward, masks)

        action, action_log_prob, states = actor_critic.act(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True))  #[0].data

        next_value = critic.forward(
            Variable(rollouts.observations[-1], volatile=True),
            action_log_prob).data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

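        # DDPG-style off-policy update on a small batch sampled from the replay buffer.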
        if True:
            state, next_state, action, reward, done = mem_buffer.sample(5)
            next_state = next_state.reshape([-1, *obs_shape])
            state = state.reshape([-1, *obs_shape])
            action = action.reshape([-1, 6])
            next_q_values = critic_target(
                to_tensor(next_state, volatile=True),
                target_actor(to_tensor(next_state, volatile=True),
                             to_tensor(next_state, volatile=True),
                             to_tensor(next_state, volatile=True))[0])
            next_q_values.volatile = False
            target_q_batch = to_tensor(reward) + args.gamma * to_tensor(
                1.0 - done.astype(np.float32)) * next_q_values
            critic.zero_grad()
            q_batch = critic(to_tensor(state), to_tensor(action))
            value_loss = criterion(q_batch, target_q_batch)
            value_loss.backward()
            critic_optim.step()
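            # Deterministic policy gradient: minimize -Q(s, pi(s)) with respect to the actor.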
            actor_critic.zero_grad()
            policy_loss = -critic(
                to_tensor(state),
                actor_critic(to_tensor(state), to_tensor(state),
                             to_tensor(state))[0])
            policy_loss = policy_loss.mean()
            policy_loss.backward()
            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(),
                                        args.max_grad_norm)
            optimizer.step()
            soft_update(target_actor, actor_critic, tau)
            soft_update(critic_target, critic, tau)
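        # The original A2C/ACKTR update is left below as a disabled (string-literal) block.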
        '''
        if args.algo in ['a2c', 'acktr']:
            action_log_probs, probs, dist_entropy, states = actor_critic.evaluate_actions(Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                                                                                           Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
                                                                                           Variable(rollouts.masks[:-1].view(-1, 1)),
                                                                                           Variable(rollouts.actions.view(-1, action_shape)))
            values = critic.forward(Variable(rollouts.observations[:-1].view(-1, *obs_shape)), probs).data

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1)

            #advantages = Variable(rollouts.returns[:-1]) - values
            advantages = rollouts.returns[:-1] - values
            value_loss = advantages.pow(2).mean()

            action_loss = -(Variable(advantages) * action_log_probs).mean()
            #action_loss = -(Variable(advantages.data) * action_log_probs).mean()


            optimizer.zero_grad()
            critic_optim.zero_grad()
            (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)

            optimizer.step()
            critic_optim.step()
        '''
        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        value_loss.data.cpu().numpy()[0],
                        policy_loss.data.cpu().numpy()[0]))
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo)
            except IOError:
                pass
Example #4
def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    # logger = Logger(environment_name = args.env_name, entropy_coff= 'entropy_coeff_' + str(args.entropy_coef), folder = args.folder)
    # logger.save_args(args)

    # print ("---------------------------------------")
    # print ('Saving to', logger.save_folder)
    # print ("---------------------------------------")

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ]

    ### Vectorize according to the number of processes to use
    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)
    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    ## ALE environments mostly have a Discrete action_space type
    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    ### shape==3 for ALE environments: states are 3D image observations
    if len(envs.observation_space.shape) == 3:
        actor = Actor(obs_shape[0], envs.action_space, args.recurrent_policy,
                      envs.action_space.n)
        target_actor = Actor(obs_shape[0], envs.action_space,
                             args.recurrent_policy, envs.action_space.n)
        critic = Critic(in_channels=4, num_actions=envs.action_space.n)
        critic_target = Critic(in_channels=4, num_actions=envs.action_space.n)
        baseline_target = Baseline_Critic(in_channels=4,
                                          num_actions=envs.action_space.n)

    if args.cuda:
        actor.cuda()
        critic.cuda()
        critic_target.cuda()
        target_actor.cuda()
        baseline_target.cuda()

    actor_optim = optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic_optim = optim.Adam(critic.parameters(), lr=args.critic_lr)
    baseline_optim = optim.Adam(actor.parameters(), lr=1e-4)
    tau_soft_update = 0.001

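    # Replay buffer for off-policy updates plus rollout storage for n-step returns.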
    mem_buffer = ReplayBuffer()
    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor.state_size,
                              envs.action_space.n)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()

    for j in range(num_updates):

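        # Temperature passed to the actor, annealed over the steps of this rollout.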
        temperature = 1.0

        ## num_steps = 5 as in A2C
        for step in range(args.num_steps):
            temperature = temperature / (step + 1)
            # Sample actions
            action, action_log_prob, states, dist_entropy = actor.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True), temperature,
                envs.action_space.n, args.num_processes)

            value = critic.forward(
                Variable(rollouts.observations[step], volatile=True),
                action_log_prob)

            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            pre_state = rollouts.observations[step].cpu().numpy()
            update_current_obs(obs)

            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, dist_entropy.data,
                            value.data, reward, masks)

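        # Store one n-step transition (first state, n-th state, first action,
        # n-step return) in the replay buffer.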
        nth_step_return = rollouts.returns[0].cpu().numpy()
        current_state = rollouts.observations[0].cpu().numpy()
        nth_state = rollouts.observations[-1].cpu().numpy()
        current_action = rollouts.action_log_probs[0].cpu().numpy()
        current_action_dist_entropy = rollouts.dist_entropy[0].cpu().numpy()

        mem_buffer.add((current_state, nth_state, current_action,
                        nth_step_return, done, current_action_dist_entropy))
        action, action_log_prob, states, dist_entropy = actor.act(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True), temperature,
            envs.action_space.n, args.num_processes)  #[0].data

        next_value = critic.forward(
            Variable(rollouts.observations[-1], volatile=True),
            action_log_prob).data
        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        bs_size = args.batch_size
        if len(mem_buffer.storage) >= bs_size:
            ##samples from the replay buffer
            state, next_state, action, returns, done, entropy_log_prob = mem_buffer.sample(
                bs_size)

            next_state = next_state.reshape([-1, *obs_shape])
            state = state.reshape([-1, *obs_shape])
            action = action.reshape([-1, envs.action_space.n])

            #current Q estimate
            q_batch = critic(to_tensor(state), to_tensor(action))
            # target Q estimate
            next_state_action_probs = target_actor(
                to_tensor(next_state, volatile=True),
                to_tensor(next_state, volatile=True),
                to_tensor(next_state, volatile=True))

            next_q_values = critic_target(to_tensor(next_state, volatile=True),
                                          next_state_action_probs[1])
            next_q_values.volatile = False
            target_q_batch = to_tensor(returns) + args.gamma * to_tensor(
                1.0 - done.astype(np.float32)) * next_q_values

            critic.zero_grad()
            value_loss = criterion(q_batch, target_q_batch)

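            # Optionally regularize the critic by pushing the norm of its gradients toward 1.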
            if args.gradient_penalty == True:
                gradients = torch.autograd.grad(value_loss,
                                                critic.parameters(),
                                                allow_unused=True,
                                                retain_graph=True,
                                                create_graph=True,
                                                only_inputs=True)[0]
                gradient_penalty = ((gradients.norm(2, dim=1) - 1)**
                                    2).mean() * args.lambda_grad_penalty
                gradient_penalty.backward()

            else:
                value_loss = criterion(q_batch, target_q_batch)
                value_loss.backward()

            critic_optim.step()

            actor.zero_grad()
            policy_loss = -critic(
                to_tensor(state),
                actor(to_tensor(state), to_tensor(state), to_tensor(state))[0])

            ### Soft trust region constraint for the actor
            current_action_probs = actor(to_tensor(state, volatile=False),
                                         to_tensor(state, volatile=False),
                                         to_tensor(state, volatile=False))[0]
            target_action_probs = target_actor(to_tensor(state, volatile=True),
                                               to_tensor(state, volatile=True),
                                               to_tensor(state,
                                                         volatile=True))[0]

            policy_regularizer = criterion(current_action_probs,
                                           target_action_probs)

            ## Actor update with entropy penalty
            policy_loss = policy_loss.mean() - args.entropy_coef * Variable(torch.from_numpy(np.expand_dims(entropy_log_prob.mean(), axis=0))).cuda() \
                            + args.actor_kl_lambda * policy_regularizer

            if args.actor_several_updates == True:
                for p in range(args.actor_updates):
                    policy_loss.backward(retain_variables=True)
            else:
                policy_loss.backward()

            ##clipping of gradient norms
            gradient_norms = nn.utils.clip_grad_norm(actor.parameters(),
                                                     args.max_grad_norm)
            print("gradient_norms", gradient_norms)
            actor_optim.step()

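            # Optionally train a separate baseline critic f(s, mu(s)) toward Q,
            # with a penalty on the gradient mismatch.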
            if args.second_order_grads == True:
                """
                Training the Baseline critic (f(s, \mu(s)))
                """
                baseline_target.zero_grad()
                ## f(s, \mu(s))
                current_baseline = baseline_target(
                    to_tensor(state),
                    actor(to_tensor(state), to_tensor(state),
                          to_tensor(state))[0])

                ## \grad f(s,a)
                grad_baseline_params = torch.autograd.grad(
                    current_baseline.mean(),
                    actor.parameters(),
                    retain_graph=True,
                    create_graph=True)

                ## MSE : (Q - f)^{2}
                baseline_loss = (q_batch.detach() -
                                 current_baseline).pow(2).mean()
                # baseline_loss.volatile=True

                actor.zero_grad()
                baseline_target.zero_grad()
                grad_norm = 0
                for grad_1, grad_2 in zip(grad_params, grad_baseline_params):
                    grad_norm += grad_1.data.pow(2).sum() - grad_2.pow(2).sum()
                grad_norm = grad_norm.sqrt()

                ##Loss for the Baseline approximator (f)
                overall_loss = baseline_loss + args.lambda_second_order_grads * grad_norm
                overall_loss.backward()
                baseline_optim.step()

            soft_update(target_actor, actor, tau_soft_update)
            soft_update(critic_target, critic, tau_soft_update)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "" and len(
                mem_buffer.storage) >= bs_size:
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor
            if args.cuda:
                save_model = copy.deepcopy(actor).cpu()

            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0 and len(mem_buffer.storage) >= bs_size:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, value loss {:.5f}, policy loss {:.5f}, Entropy {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        value_loss.data.cpu().numpy()[0],
                        policy_loss.data.cpu().numpy()[0],
                        entropy_log_prob.mean()))

            final_rewards_mean = [final_rewards.mean()]
            final_rewards_median = [final_rewards.median()]
            final_rewards_min = [final_rewards.min()]
            final_rewards_max = [final_rewards.max()]

            all_value_loss = [value_loss.data.cpu().numpy()[0]]
            all_policy_loss = [policy_loss.data.cpu().numpy()[0]]

            # logger.record_data(final_rewards_mean, final_rewards_median, final_rewards_min, final_rewards_max, all_value_loss, all_policy_loss)
            # # logger.save()

        if args.vis and j % args.vis_interval == 0:

            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo)
            except IOError:
                pass
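
Examples #3 and #4 rely on a soft_update(target, source, tau) helper that is not shown. A minimal sketch of such a Polyak update, assuming the networks are plain PyTorch nn.Module objects:

def soft_update(target, source, tau):
    # Sketch: theta_target <- tau * theta_source + (1 - tau) * theta_target
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)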