Example #1
    def episode_finished(self, episode_number):
        all_actions = torch.stack(self.actions,
                                  dim=0).to(self.train_device).squeeze(-1)
        all_rewards = torch.stack(self.rewards,
                                  dim=0).to(self.train_device).squeeze(-1)
        all_values = torch.stack(self.values,
                                 dim=0).to(self.train_device).squeeze(-1)

        self.values, self.actions, self.rewards = [], [], []
        discounted_rewards = discount_rewards(all_rewards, self.gamma)

        error = discounted_rewards - all_values
        error -= torch.mean(error)
        error /= torch.std(error.detach())

        self.optimizer_p.zero_grad()
        self.optimizer_v.zero_grad()

        p_loss = (error.detach() * all_actions).sum()
        c_loss = error.pow(2).mean()

        p_loss.backward()
        c_loss.backward()

        self.optimizer_p.step()
        self.optimizer_v.step()

        self.episode_number += 1
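Every example on this page calls a discount_rewards helper that is not shown in the excerpts. For reference, a minimal PyTorch sketch of such a helper, assuming the (rewards, gamma) signature used above (each project's actual implementation may differ), scans the rewards backwards and accumulates G_t = r_t + gamma * G_{t+1}:

import torch

def discount_rewards(r, gamma):
    # Reference sketch only: discounted returns computed backwards in time,
    # so that discounted[t] = r[t] + gamma * discounted[t + 1].
    discounted = torch.zeros_like(r)
    running = 0.0
    for t in reversed(range(len(r))):
        running = r[t] + gamma * running
        discounted[t] = running
    return discounted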
Example #2
    def episode_finished(self, episode_number):
        action_probs = torch.stack(self.action_probs, dim=0) \
                .to(self.train_device).squeeze(-1)
        rewards = torch.stack(self.rewards,
                              dim=0).to(self.train_device).squeeze(-1)
        self.states, self.action_probs, self.rewards = [], [], []

        # TODO: Compute discounted rewards (use the discount_rewards function)
        discounted_rewards = discount_rewards(rewards, self.gamma)
        #Task1.3
        discounted_rewards -= torch.mean(discounted_rewards)
        discounted_rewards /= torch.std(discounted_rewards)

        # TODO: Compute critic loss and advantages (T3)

        # TODO: Compute the optimization term (T1, T3)
        T = len(rewards)
        gammas = torch.tensor([self.gamma**t
                               for t in range(T)]).to(self.train_device)
        #baseline=20(Task 1b)
        #optim = -gammas*(discounted_rewards-20)*action_probs
        optim = -gammas * discounted_rewards * action_probs
        loss = optim.sum()
        loss.backward()

        # TODO: Compute the gradients of loss w.r.t. network parameters (T1)

        # TODO: Update network parameters using self.optimizer and zero gradients (T1)
        self.optimizer.step()
        self.optimizer.zero_grad()
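The optimization term above, -gammas * discounted_rewards * action_probs, only matches the REINFORCE gradient if the stored action_probs are log-probabilities of the actions that were actually taken. As an illustration of how such a buffer is typically filled during an episode (a sketch; the method and attribute names below are assumptions, not taken from this excerpt, and torch is assumed to be imported at module level):

    def get_action(self, observation):
        # Sketch: the policy's forward pass is assumed to return a
        # torch.distributions object (e.g. Normal or Categorical).
        x = torch.from_numpy(observation).float().to(self.train_device)
        dist = self.policy.forward(x)
        action = dist.sample()                            # sample from the current policy
        self.action_probs.append(dist.log_prob(action))   # store log pi(a_t | s_t)
        return action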
Example #3
    def episode_finished(self, episode_number):
        action_probs = torch.stack(self.action_probs, dim=0) \
                .to(self.train_device).squeeze(-1)
        rewards = torch.stack(self.rewards,
                              dim=0).to(self.train_device).squeeze(-1)
        state_values = torch.stack(self.state_values,
                                   dim=0).to(self.train_device).squeeze(-1)
        self.states, self.action_probs, self.rewards, self.state_values = [], [], [], []

        # Compute discounted rewards (use the discount_rewards function)
        discounted_rewards = discount_rewards(rewards, self.gamma)
        discounted_rewards = (discounted_rewards -
                              torch.mean(discounted_rewards)) / torch.std(
                                  discounted_rewards)  # T1c

        # Compute critic loss and advantages (T3)
        loss = 0
        for log_prob, value, reward in zip(action_probs, state_values,
                                           discounted_rewards):
            advantage = reward - value.item()
            policy_loss = -advantage * log_prob
            value_loss = F.smooth_l1_loss(value, reward)  # smooth L1 (Huber) loss for the critic
            loss += (policy_loss + value_loss)

        # Compute the optimization term (T1, T3)
        #loss = policy_losses.sum() + value_losses.sum()
        #  Compute the gradients of loss w.r.t. network parameters (T1)
        loss.backward()
        #  Update network parameters using self.optimizer and zero gradients (T1)
        self.optimizer.step()
        self.optimizer.zero_grad()
Example #4
    def episode_finished(self, episode_number):
        # Task 2a: update sigma of the policy exponentially decreasingly.
        # self.policy.update_sigma_exponentially(episode_number + 1)

        action_probs = torch.stack(self.action_probs, dim=0) \
            .to(self.train_device).squeeze(-1)
        rewards = torch.stack(self.rewards,
                              dim=0).to(self.train_device).squeeze(-1)
        self.states, self.action_probs, self.rewards = [], [], []

        # DONE: Compute discounted rewards (use the discount_rewards function)
        discounted_rewards = discount_rewards(rewards, self.gamma)

        # Task 1c
        discounted_rewards -= torch.mean(discounted_rewards)
        discounted_rewards /= torch.std(discounted_rewards)

        # DONE: Compute the optimization term (T1)
        # task 1a
        baseline = 0
        # task 1b
        # baseline = 20

        weighted_probs = -action_probs * (discounted_rewards - baseline)

        # DONE: Compute the gradients of loss w.r.t. network parameters (T1)
        loss = torch.mean(weighted_probs)
        loss.backward()

        # DONE: Update network parameters using self.optimizer and zero gradients (T1)
        self.optimizer.step()
        self.optimizer.zero_grad()
Example #5
    def episode_finished(self):
        action_probs = torch.stack(self.action_probs, dim=0) \
                .to(self.train_device).squeeze(-1)
        rewards = torch.stack(self.rewards,
                              dim=0).to(self.train_device).squeeze(-1)
        self.states, self.action_probs, self.rewards = [], [], []

        # TODO: Compute discounted rewards (use the discount_rewards function)
        discounted_rewards = discount_rewards(
            rewards, self.gamma)  # computing the discounted reward
        # normalize the discounted rewards task 1.c
        discounted_rewards -= torch.mean(discounted_rewards)
        discounted_rewards /= torch.std(discounted_rewards)

        # TODO: Compute critic loss and advantages (T3)

        # TODO: Compute the optimization term (T1, T3)
        #weighted_probs = -action_probs * (discounted_rewards -20) # with baseline task 1.b
        weighted_probs = -action_probs * discounted_rewards  # without baseline

        # TODO: Compute the gradients of loss w.r.t. network parameters (T1)
        #computing the loss
        loss = torch.mean(weighted_probs)
        loss.backward()

        # TODO: Update network parameters using self.optimizer and zero gradients (T1)
        self.optimizer.step()
        self.optimizer.zero_grad()
Example #6
    def episode_finished(self, episode_number):
        action_probs = torch.stack(self.action_probs, dim=0) \
                .to(self.train_device).squeeze(-1)
        rewards = torch.stack(self.rewards,
                              dim=0).to(self.train_device).squeeze(-1)
        self.states, self.action_probs, self.rewards = [], [], []
        # b = 20 # T1b baseline

        # T2a
        # self.policy.sigma = self.policy.sigma_init*np.exp(-0.0005*episode_number)

        # DONE: Compute discounted rewards (use the discount_rewards function)
        discounted_rewards = discount_rewards(rewards, self.gamma)
        discounted_rewards -= torch.mean(discounted_rewards)  # T1c
        discounted_rewards /= torch.std(discounted_rewards)  # T1c

        weighted_probs = -action_probs * discounted_rewards  # T1a, T1c, T2
        # weighted_probs = -action_probs * (discounted_rewards - b) # T1b

        # DONE: Compute the optimization term (T1)
        loss = torch.mean(weighted_probs)

        # DONE: Compute the gradients of loss w.r.t. network parameters (T1)
        loss.backward()

        # DONE: Update network parameters using self.optimizer and zero gradients (T1)
        self.optimizer.step()
        self.optimizer.zero_grad()
Example #7
    def __init__(self, sess, state_dim, action_dim, learning_rate):
        self.sess = sess
        self.s_dim = state_dim
        self.a_dim = action_dim
        self.learning_rate = learning_rate

        # Actor Network
        self.inputs, self.out = self.create_actor_network()
        network_params = tf.trainable_variables()

        # This returns will be provided by the Discount Reward
        self._current_val = tf.placeholder("float", [None, 1],
                                           name='current_val')
        self._returns = tf.placeholder("float", [None, 1], name='returns')
        self.actions = tf.placeholder("float", [None, self.a_dim],
                                      name='actions')

        self._discounted_returns = utils.discount_rewards(self._returns)

        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        self.action_prob = tf.reduce_sum(self.actions * self.out,
                                         reduction_indices=1)
        self.loss = -tf.log(self.action_prob) * (self._discounted_returns -
                                                 self._current_val)
        #self.optimize = optimizer.minimize(self.loss)
        grads_and_vars = optimizer.compute_gradients(self.loss, network_params)
        self.optimize = optimizer.apply_gradients(grads_and_vars)
Example #8
    def episode_finished(self, episode_number):
        action_probs = torch.stack(self.action_probs, dim=0) \
                .to(self.train_device).squeeze(-1)
        rewards = torch.stack(self.rewards, dim=0).to(self.train_device).squeeze(-1)
        self.states, self.action_probs, self.rewards = [], [], []

        # TODO: Update policy variance (T2) -- DONE
        c = 5e-4
        # self.variance = self.policy.sigma * np.exp(-c * episode_number)  # Exponentially decaying variance

        # TODO: Compute discounted rewards (use the discount_rewards function) -- DONE
        rewards = discount_rewards(rewards, gamma=self.gamma)
        rewards = (rewards - torch.mean(rewards))/torch.std(rewards)  # REINFORCE with normalized rewards

        # TODO: Compute critic loss and advantages (T3)

        # TODO: Compute the optimization term (T1, T3) -- DONE
        loss = torch.sum(-rewards * action_probs)  # REINFORCE
        # loss = torch.sum(-(rewards - self.baseline) * action_probs)  # REINFORCE with baseline

        # TODO: Compute the gradients of loss w.r.t. network parameters (T1) -- DONE
        loss.backward()

        # TODO: Update network parameters using self.optimizer and zero gradients (T1) -- DONE
        self.optimizer.step()
        self.optimizer.zero_grad()
Example #9
    def learn(self):
        """Run learning algorithm"""
        reporter = Reporter()
        config = self.config
        total_n_trajectories = np.zeros(len(self.envs))
        for iteration in range(config["n_iter"]):
            self.session.run([self.reset_accum_grads])
            for i, learner in enumerate(self.task_learners):
                # Collect trajectories until we get timesteps_per_batch total timesteps
                trajectories = learner.get_trajectories()
                total_n_trajectories[i] += len(trajectories)
                all_state = np.concatenate(
                    [trajectory["state"] for trajectory in trajectories])
                # Compute discounted sums of rewards
                rets = [
                    discount_rewards(trajectory["reward"], config["gamma"])
                    for trajectory in trajectories
                ]
                max_len = max(len(ret) for ret in rets)
                padded_rets = [
                    np.concatenate([ret, np.zeros(max_len - len(ret))])
                    for ret in rets
                ]
                # Compute time-dependent baseline
                baseline = np.mean(padded_rets, axis=0)
                # Compute advantage function
                advs = [ret - baseline[:len(ret)] for ret in rets]
                all_action = np.concatenate(
                    [trajectory["action"] for trajectory in trajectories])
                all_adv = np.concatenate(advs)
                # Do policy gradient update step
                episode_rewards = np.array([
                    trajectory["reward"].sum() for trajectory in trajectories
                ])  # episode total rewards
                episode_lengths = np.array([
                    len(trajectory["reward"]) for trajectory in trajectories
                ])  # episode lengths
                self.session.run(
                    [self.add_accum_grads[i]],
                    feed_dict={
                        self.state: all_state,
                        self.action_taken: all_action,
                        self.advantage: all_adv
                    })
                # summary = self.session.run([self.master.summary_op], feed_dict={
                #     self.reward: reward
                #     # self.master.episode_length: trajectory["steps"]
                # })

                # self.writer.add_summary(summary[0], iteration)
                # self.writer.flush()
                print("Task:", i)
                reporter.print_iteration_stats(iteration, episode_rewards,
                                               episode_lengths,
                                               total_n_trajectories[i])

            # Apply accumulated gradient after all the gradients of each task are summed
            self.session.run([self.apply_gradients])
Example #10
    def rewards_discounted(self):
        """Compute the discounted reward backwards through time."""

        reward_his = discount_rewards(self.rewards)
        # standardize the rewards to be unit normal
        # (helps control the gradient estimator variance)
        reward_his -= np.mean(reward_his)
        std = np.std(reward_his)
        if std > 0:  # guard against a zero standard deviation (avoids division by zero)
            reward_his /= std

        return reward_his
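The NumPy-based examples (this one and the trajectory learners above) assume a NumPy flavour of the same helper; note that the call here passes no discount factor, so that project's version presumably has a default. A minimal sketch under those assumptions:

import numpy as np

def discount_rewards(rewards, gamma=0.99):
    # Sketch only; the default gamma value is an assumption, not taken from the source.
    discounted = np.zeros_like(rewards, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        discounted[t] = running
    return discounted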
Example #11
 def run(self):
     # Assume global shared parameter vectors θ and θv and global shared counter T = 0
     # Assume thread-specific parameter vectors θ' and θ'v
     sess = self.master.session
     t = 1  # thread step counter
     while self.master.T < self.master.config[
             'T_max'] and not self.master.stop_requested:
         # Reset gradients: dθ = 0 and dθv = 0
         sess.run([self.actor_reset_ag, self.critic_reset_ag])
         # Synchronize thread-specific parameters θ' = θ and θ'v = θv
         sess.run([self.actor_sync_net, self.critic_sync_net])
         trajectory = self.get_trajectory(
             self.master.config['episode_max_length'])
         reward = sum(trajectory['reward'])
         trajectory['reward'][-1] = 0 if trajectory[
             'done'] else self.get_critic_value(trajectory['state'][None,
                                                                    -1])[0]
         returns = discount_rewards(trajectory['reward'],
                                    self.master.config['gamma'])
         fetches = [
             self.actor_net.summary_loss, self.critic_net.summary_loss,
             self.actor_add_ag, self.critic_add_ag, self.master.global_step
         ]  # What does the master global step thing do?
         ac_net = self.actor_net
         cr_net = self.critic_net
         qw_new = self.master.session.run(
             [cr_net.value], feed_dict={cr_net.state:
                                        trajectory['state']})[0].flatten()
         all_action = self.transform_actions(
             trajectory['action']
         )  # Transform actions back to the output shape of the actor network (e.g. one-hot for discrete action space)
         results = sess.run(fetches,
                            feed_dict={
                                ac_net.state: trajectory["state"],
                                cr_net.state: trajectory["state"],
                                ac_net.actions_taken: all_action,
                                ac_net.critic_feedback: qw_new,
                                ac_net.critic_rewards: returns,
                                cr_net.target: returns.reshape(-1, 1)
                            })
         summary = sess.run(
             [self.master.summary_op],
             feed_dict={
                 self.master.actor_loss: results[0],
                 self.master.critic_loss: results[1],
                 self.master.reward: reward,
                 self.master.episode_length: trajectory["steps"]
             })
         self.writer.add_summary(summary[0], t)
         self.writer.flush()
         sess.run([self.apply_actor_gradients, self.apply_critic_gradients])
         t += 1
         self.master.T += trajectory['steps']
Example #12
    def learn_REINFORCE(self):
        """Learn using updates like in the REINFORCE algorithm."""
        reporter = Reporter()
        config = self.master.config
        total_n_trajectories = 0
        iteration = 0
        while iteration < config["n_iter"] and not self.master.stop_requested:
            iteration += 1
            self.master.session.run([self.master.reset_accum_grads])
            # Collect trajectories until we get timesteps_per_batch total timesteps
            trajectories = self.task_learner.get_trajectories()
            total_n_trajectories += len(trajectories)
            all_state = np.concatenate(
                [trajectory["state"] for trajectory in trajectories])
            # Compute discounted sums of rewards
            rets = [
                discount_rewards(trajectory["reward"], config["gamma"])
                for trajectory in trajectories
            ]
            max_len = max(len(ret) for ret in rets)
            padded_rets = [
                np.concatenate([ret, np.zeros(max_len - len(ret))])
                for ret in rets
            ]
            # Compute time-dependent baseline
            baseline = np.mean(padded_rets, axis=0)
            # Compute advantage function
            advs = [ret - baseline[:len(ret)] for ret in rets]
            all_action = np.concatenate(
                [trajectory["action"] for trajectory in trajectories])
            all_adv = np.concatenate(advs)
            # Do policy gradient update step
            episode_rewards = np.array([
                trajectory["reward"].sum() for trajectory in trajectories
            ])  # episode total rewards
            episode_lengths = np.array([
                len(trajectory["reward"]) for trajectory in trajectories
            ])  # episode lengths
            self.master.session.run(
                [self.add_accum_grad],
                feed_dict={
                    self.master.state: all_state,
                    self.master.action_taken: all_action,
                    self.master.advantage: all_adv
                })
            print("Task:", self.thread_id)
            reporter.print_iteration_stats(iteration, episode_rewards,
                                           episode_lengths,
                                           total_n_trajectories)

            self.master.session.run([self.master.apply_gradients])
Example #13
    def learn(self):
        """Run learning algorithm"""
        reporter = Reporter()
        config = self.config
        total_n_trajectories = 0
        for iteration in range(config["n_iter"]):
            # Collect trajectories until we get timesteps_per_batch total timesteps
            trajectories = self.get_trajectories()
            total_n_trajectories += len(trajectories)
            all_state = np.concatenate(
                [trajectory["state"] for trajectory in trajectories])
            # Compute discounted sums of rewards
            rets = [
                discount_rewards(trajectory["reward"], config["gamma"])
                for trajectory in trajectories
            ]
            max_len = max(len(ret) for ret in rets)
            padded_rets = [
                np.concatenate([ret, np.zeros(max_len - len(ret))])
                for ret in rets
            ]
            # Compute time-dependent baseline
            baseline = np.mean(padded_rets, axis=0)
            # Compute advantage function
            advs = [ret - baseline[:len(ret)] for ret in rets]
            all_action = np.concatenate(
                [trajectory["action"] for trajectory in trajectories])
            all_adv = np.concatenate(advs)
            # Do policy gradient update step
            episode_rewards = np.array([
                trajectory["reward"].sum() for trajectory in trajectories
            ])  # episode total rewards
            episode_lengths = np.array([
                len(trajectory["reward"]) for trajectory in trajectories
            ])  # episode lengths
            result = self.session.run(
                [self.summary_op, self.train],
                feed_dict={
                    self.state: all_state,
                    self.a_n: all_action,
                    self.adv_n: all_adv,
                    self.episode_lengths: np.mean(episode_lengths),
                    self.rewards: np.mean(episode_rewards)
                })
            self.writer.add_summary(result[0], iteration)
            self.writer.flush()

            reporter.print_iteration_stats(iteration, episode_rewards,
                                           episode_lengths,
                                           total_n_trajectories)
Example #14
    def learn(self, env):
        reporter = Reporter()

        self.session.run([self.reset_accumulative_grads])

        iteration = 0  # amount of batches processed
        episode_nr = 0
        episode_lengths = np.zeros(self.config['batch_size'])
        episode_rewards = np.zeros(self.config['batch_size'])
        mean_rewards = []
        while True:  # Keep executing episodes
            trajectory = self.get_trajectory(env,
                                             self.config["episode_max_length"])

            episode_rewards[episode_nr % self.config['batch_size']] = sum(
                trajectory['reward'])
            episode_lengths[episode_nr % self.config['batch_size']] = len(
                trajectory['reward'])
            episode_nr += 1
            action_taken = (np.arange(
                self.nA) == trajectory['action'][:, None]).astype(
                    np.float32)  # one-hot encoding

            discounted_episode_rewards = discount_rewards(
                trajectory['reward'], self.config['gamma'])
            # standardize
            discounted_episode_rewards -= np.mean(discounted_episode_rewards)
            std = np.std(discounted_episode_rewards)
            std = std if std > 0 else 1
            discounted_episode_rewards /= std
            feedback = np.reshape(
                np.repeat(discounted_episode_rewards, self.nA),
                (len(discounted_episode_rewards), self.nA))

            self.session.run(
                [self.accumulate_grads],
                feed_dict={
                    self.state: trajectory["state"],
                    self.action_taken: action_taken,
                    self.feedback: feedback
                })
            if episode_nr % self.config['batch_size'] == 0:  # batch is done
                iteration += 1
                self.session.run([self.apply_gradients])
                self.session.run([self.reset_accumulative_grads])
                reporter.print_iteration_stats(iteration, episode_rewards,
                                               episode_lengths, episode_nr)
                mean_rewards.append(episode_rewards.mean())
                if episode_nr % self.config['draw_frequency'] == 0:
                    reporter.draw_rewards(mean_rewards)
Example #15
    def episode_finished(self, episode_number):
        all_actions = torch.stack(self.actions, dim=0).to(self.train_device).squeeze(-1)
        all_rewards = torch.stack(self.rewards, dim=0).to(self.train_device).squeeze(-1)
        self.observations, self.actions, self.rewards = [], [], []
        discounted_rewards = discount_rewards(all_rewards, self.gamma)
        discounted_rewards -= torch.mean(discounted_rewards)
        discounted_rewards /= torch.std(discounted_rewards)

        weighted_probs = all_actions * discounted_rewards
        loss = torch.mean(weighted_probs)
        loss.backward()

        if (episode_number+1) % self.batch_size == 0:
            self.update_policy()
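In this example each loss.backward() call accumulates gradients in .grad, and the actual parameter update is deferred to self.update_policy() once every batch_size episodes. That method is not shown; a plausible body, consistent with how the other examples step and zero their optimizer, would simply be:

    def update_policy(self):
        # Assumed body: apply the gradients accumulated over the batch, then clear them.
        self.optimizer.step()
        self.optimizer.zero_grad()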
Example #16
    def learn(self):
        reporter = Reporter()

        gradient1 = np.zeros_like(self.w1)
        gradient2 = np.zeros_like(self.w2)

        rmsprop1 = np.zeros_like(self.w1)
        rmsprop2 = np.zeros_like(self.w2)

        iteration = 0  # amount of batches processed
        episode_nr = 0
        episode_lengths = np.zeros(self.config['batch_size'])
        episode_rewards = np.zeros(self.config['batch_size'])
        mean_rewards = []
        while True:  # Keep executing episodes
            trajectory = self.get_trajectory(self.config["episode_max_length"])

            episode_rewards[episode_nr % self.config['batch_size']] = sum(trajectory['reward'])
            episode_lengths[episode_nr % self.config['batch_size']] = len(trajectory['reward'])
            episode_nr += 1
            action_taken = (np.arange(self.nA) == trajectory['action'][:, None]).astype(np.float32)  # one-hot encoding
            epdlogp = action_taken - trajectory['prob']

            # episode_states = np.vstack(encountered_states)

            discounted_episode_rewards = discount_rewards(trajectory['reward'], self.config['gamma'])
            # print(discounted_episode_rewards)
            # standardize
            discounted_episode_rewards -= np.mean(discounted_episode_rewards)
            discounted_episode_rewards /= np.std(discounted_episode_rewards)
            epdlogp *= np.reshape(np.repeat(discounted_episode_rewards, self.nA), (len(discounted_episode_rewards), self.nA))

            change_w1, change_w2 = self.backward_step(trajectory['state'], trajectory['x1'], epdlogp)

            gradient1 += change_w1
            gradient2 += change_w2

            if episode_nr % self.config['batch_size'] == 0:  # batch is done
                iteration += 1
                rmsprop1 = self.config['decay_rate'] * rmsprop1 + (1 - self.config['decay_rate']) * gradient1**2
                rmsprop2 = self.config['decay_rate'] * rmsprop2 + (1 - self.config['decay_rate']) * gradient2**2
                self.w1 += self.config['learning_rate'] * gradient1 / (np.sqrt(rmsprop1) + 1e-5)
                self.w2 += self.config['learning_rate'] * gradient2 / (np.sqrt(rmsprop2) + 1e-5)
                gradient1 = np.zeros_like(self.w1)
                gradient2 = np.zeros_like(self.w2)
                reporter.print_iteration_stats(iteration, episode_rewards, episode_lengths, episode_nr)
                mean_rewards.append(episode_rewards.mean())
                if episode_nr % self.config['draw_frequency'] == 0:
                    reporter.draw_rewards(mean_rewards)
Example #17
    def learn(self):
        """Run learning algorithm"""
        reporter = Reporter()
        config = self.config
        possible_actions = np.arange(self.nA)
        total_n_trajectories = 0
        for iteration in range(config["n_iter"]):
            # Collect trajectories until we get timesteps_per_batch total timesteps
            trajectories = self.get_trajectories()
            total_n_trajectories += len(trajectories)
            all_action = np.concatenate(
                [trajectory["action"] for trajectory in trajectories])
            all_action = (possible_actions == all_action[:, None]).astype(
                np.float32)
            all_state = np.concatenate(
                [trajectory["state"] for trajectory in trajectories])
            # Compute discounted sums of rewards
            returns = np.concatenate([
                discount_rewards(trajectory["reward"], config["gamma"])
                for trajectory in trajectories
            ])
            qw_new = self.get_critic_value(all_state)

            episode_rewards = np.array([
                trajectory["reward"].sum() for trajectory in trajectories
            ])  # episode total rewards
            episode_lengths = np.array([
                len(trajectory["reward"]) for trajectory in trajectories
            ])  # episode lengths

            results = self.sess.run(
                [self.summary_op, self.critic_train, self.actor_train],
                feed_dict={
                    self.critic_state_in: all_state,
                    self.critic_target: returns,
                    self.actor_input: all_state,
                    self.actions_taken: all_action,
                    self.critic_feedback: qw_new,
                    self.critic_rewards: returns,
                    self.rewards: np.mean(episode_rewards),
                    self.episode_lengths: np.mean(episode_lengths)
                })
            self.writer.add_summary(results[0], iteration)
            self.writer.flush()

            reporter.print_iteration_stats(iteration, episode_rewards,
                                           episode_lengths,
                                           total_n_trajectories)
Example #18
    def update_policy(self, episode_number):
        # Convert buffers to Torch tensors
        action_probs = torch.stack(self.action_probs, dim=0) \
            .to(self.train_device).squeeze(-1)
        rewards = torch.stack(self.rewards,
                              dim=0).to(self.train_device).squeeze(-1)
        states = torch.stack(self.states,
                             dim=0).to(self.train_device).squeeze(-1)
        next_states = torch.stack(self.next_states,
                                  dim=0).to(self.train_device).squeeze(-1)
        done = torch.Tensor(self.done).to(self.train_device)
        # Clear state transition buffers
        self.states, self.action_probs, self.rewards = [], [], []
        self.next_states, self.done = [], []

        # DONE: Compute state values
        state_values = torch.stack(
            [self.policy.forward(state)[1][0] for state in states])
        next_state_values = torch.stack(
            [self.policy.forward(state)[1][0] for state in next_states])

        # DONE: Compute critic loss (MSE)
        discounted_rewards = discount_rewards(rewards, self.gamma)

        # Normalize discounted rewards.
        discounted_rewards -= torch.mean(discounted_rewards)
        discounted_rewards /= torch.std(discounted_rewards)

        mse_loss = torch.nn.MSELoss()
        critic_loss = mse_loss(state_values, discounted_rewards)

        # Advantage estimates
        # DONE: Compute advantage estimates
        advantages = rewards + self.gamma * next_state_values - state_values

        # DONE: Calculate actor loss (very similar to PG)
        weighted_probs = -action_probs * advantages.detach()
        actor_loss = torch.mean(weighted_probs)

        # DONE: Compute the gradients of loss w.r.t. network parameters
        # Or copy from Ex5
        loss = actor_loss + critic_loss
        loss.backward()

        # DONE: Update network parameters using self.optimizer and zero gradients
        # Or copy from Ex5
        self.optimizer.step()
        self.optimizer.zero_grad()
Example #19
    def episode_finished(self, episode_number):
        action_probs = torch.stack(self.action_probs, dim=0) \
                .to(self.train_device).squeeze(-1)
        rewards = torch.stack(self.rewards, dim=0).to(self.train_device).squeeze(-1)
        self.states, self.action_probs, self.rewards = [], [], []

        G = discount_rewards(rewards, self.gamma)
        if self.normalize:
            G = (G - G.mean()) / (G.std() + 1e-6)

        optimizer_terms = -(G - self.baseline) * action_probs
        loss = optimizer_terms.sum()
        loss.backward()
        # TODO: Update network parameters using self.optimizer and zero gradients (T1)
        self.optimizer.step()
        self.optimizer.zero_grad()
Example #20
    def episode_finished(self, episode_number):
        all_actions = torch.stack(self.actions,
                                  dim=0).to(self.train_device).squeeze(-1)
        all_rewards = torch.stack(self.rewards,
                                  dim=0).to(self.train_device).squeeze(-1)
        self.observations, self.actions, self.rewards = [], [], []
        discounted_rewards = discount_rewards(all_rewards, self.gamma)
        discounted_rewards -= torch.mean(discounted_rewards)
        discounted_rewards /= torch.std(discounted_rewards)

        weighted_probs = all_actions * discounted_rewards
        loss = torch.sum(weighted_probs)
        loss.backward()

        # Update policy
        self.optimizer.step()
        self.optimizer.zero_grad()
Example #21
    def __init__(self, sess, state_dim, learning_rate):
        self.sess = sess
        self.s_dim = state_dim
        self.learning_rate = learning_rate

        # Critic Network
        self.inputs, self._out = self.create_actor_network()

        # This returns will be provided by the Discount Reward
        self.returns = tf.placeholder("float", [None, 1], name='returns')

        # tf reward processing
        self._discounted_returns = utils.discount_rewards(self.returns)

        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        self._loss = tf.nn.l2_loss(self._out - self._discounted_returns)
        self.optimize = optimizer.minimize(self._loss)
Example #22
    def learn(self):
        """Run learning algorithm"""
        reporter = Reporter()
        config = self.config
        total_n_trajectories = 0
        for iteration in range(config["n_iter"]):
            # Collect trajectories until we get timesteps_per_batch total timesteps
            trajectories = self.get_trajectories(self.env)
            total_n_trajectories += len(trajectories)
            all_action = np.concatenate(
                [trajectory["action"] for trajectory in trajectories])
            all_ob = np.concatenate(
                [trajectory["ob"] for trajectory in trajectories])
            # Compute discounted sums of rewards
            returns = np.concatenate([
                discount_rewards(trajectory["reward"], config["gamma"])
                for trajectory in trajectories
            ])
            qw_new = self.get_critic_value(all_ob)

            print(qw_new)
            self.sess.run(
                [self.critic_train],
                feed_dict={
                    self.critic_state_in: all_ob,
                    self.critic_target: returns.reshape(-1, 1)
                })
            target = np.mean((returns - qw_new)**2)
            self.sess.run(
                [self.actor_train],
                feed_dict={
                    self.input_state: all_ob,
                    self.actions_taken: all_action,
                    self.target: target
                })

            episode_rewards = np.array([
                trajectory["reward"].sum() for trajectory in trajectories
            ])  # episode total rewards
            episode_lengths = np.array([
                len(trajectory["reward"]) for trajectory in trajectories
            ])  # episode lengths
            reporter.print_iteration_stats(iteration, episode_rewards,
                                           episode_lengths,
                                           total_n_trajectories)
Example #23
    def update(self, acts, rews, obs, optimizer):
        rews_disc = U.discount_rewards(rews, self.gamma)
        acts = torch.Tensor(acts)
        rews_disc = torch.Tensor(rews_disc)
        obs = torch.Tensor(obs)
        logits = self.forward(obs)
        probs = F.softmax(logits, dim=1)

        # index logprobs by acts
        logprobs = dists.Categorical(probs=probs).log_prob(acts)

        loss = (-logprobs * rews_disc).mean()
        ent_loss = (-probs * torch.log(probs)).sum(dim=1).mean()
        loss -= self.ent_coeff * ent_loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
Example #24
    def episode_finished(self, done):
        action_probs = torch.stack(self.action_probs, dim=0) \
                .to(self.train_device).squeeze(-1)
        rewards = torch.stack(self.rewards,
                              dim=0).to(self.train_device).squeeze(-1)
        values = torch.stack(self.values, dim=0).to(self.train_device).squeeze(
            -1)  # values from the network
        if done:
            self.states, self.action_probs, self.rewards, self.values = [], [], [], []
        #else:
        #print(done)
        #print("values : "+str(values))
        #print("reward : "+str(rewards))

        # TODO: Compute discounted rewards (use the discount_rewards function)
        discounted_rewards = discount_rewards(rewards, self.gamma)
        #print(discounted_rewards)
        #discounted_rewards = rewards + (self.gamma *  values)

        # TODO: Compute critic loss and advantages (T3)
        if done:
            advantage = discounted_rewards
        else:
            advantage = discounted_rewards - values
        advantage -= torch.mean(advantage)
        advantage /= torch.std(advantage.detach())
        critic_loss = advantage.pow(2).mean()

        # TODO: Compute the optimization term (T1, T3)
        weighted_probs = -action_probs * advantage.detach()
        #loss = weighted_probs.sum()

        actor_loss = weighted_probs.mean()
        ac_loss = actor_loss + critic_loss
        # TODO: Compute the gradients of loss w.r.t. network parameters (T1)

        # TODO: Update network parameters using self.optimizer and zero gradients (T1)
        ac_loss.backward(retain_graph=True)

        self.optimizer.step()
        self.optimizer.zero_grad()

Example #25
    def learn_Karpathy(self):
        """Learn using updates like in the Karpathy algorithm."""
        reporter = Reporter()
        config = self.master.config
        self.master.session.run([self.master.reset_accum_grads])

        iteration = 0
        episode_nr = 0
        mean_rewards = []
        while not self.master.stop_requested:  # Keep executing episodes until the master requests a stop (e.g. using SIGINT)
            iteration += 1
            trajectory = self.task_learner.get_trajectory()
            reward = sum(trajectory['reward'])
            action_taken = trajectory['action']

            discounted_episode_rewards = discount_rewards(
                trajectory['reward'], config['gamma'])
            # standardize
            discounted_episode_rewards -= np.mean(discounted_episode_rewards)
            std = np.std(discounted_episode_rewards)
            std = std if std > 0 else 1
            discounted_episode_rewards /= std
            feedback = discounted_episode_rewards

            results = self.master.session.run(
                [self.loss, self.add_accum_grad],
                feed_dict={
                    self.master.state: trajectory["state"],
                    self.master.action_taken: action_taken,
                    self.master.advantage: feedback
                })
            results = self.master.session.run(
                [self.master.summary_op],
                feed_dict={
                    self.master.loss: results[0],
                    self.master.reward: reward,
                    self.master.episode_length: trajectory["steps"]
                })
            self.writer.add_summary(results[0], iteration)
            self.writer.flush()

            self.master.session.run([self.master.apply_gradients])
            self.master.session.run([self.master.reset_accum_grads])
Example #26
	def _make_batch(self, epoch):
		current_policy, current_value, current_oracle = get_current_policy(self.env, self.PGNetwork, self.VNetwork, self.ZNetwork)

		# states = [
		#task1		[[---episode_1---],...,[---episode_n---]],
		#task2		[[---episode_1---],...,[---episode_n---]]
		#			]
		states, tasks, actions, rewards, next_states = self.rollout.rollout_batch(self.PGNetwork, current_policy, epoch)

		discounted_rewards, GAEs = [], []
		for task in range(self.env.num_task):
			discounted_rewards.append([])
			GAEs.append([])
			for ep_state, ep_next, ep_reward in zip(states[task], next_states[task], rewards[task]):	
				discounted_rewards[task] += discount_rewards(self.env, ep_reward, ep_state, ep_next, task, current_value)
				GAEs[task] += GAE(self.env, ep_reward, ep_state, ep_next, task, current_value)
			
			states[task] = np.concatenate(states[task])       
			tasks[task] = np.concatenate(tasks[task])     
			actions[task] = np.concatenate(actions[task])     
			rewards[task] = np.concatenate(rewards[task])
			next_states[task] = np.concatenate(next_states[task])

		state_dict, count_dict = statistic(self.env, states, actions, discounted_rewards, GAEs, next_states, current_value)
		task_states, task_actions, task_target_values, task_advantages, \
		sharing_states, sharing_actions, sharing_advantages = self._process_PV_batch(states,
																			  actions,
																			  discounted_rewards,
																			  GAEs,
																			  next_states,
																			  current_policy,
																			  current_value,
																			  current_oracle,
																			  count_dict)

		z_states, z_actions, z_rewards = self._process_Z_batch(state_dict, count_dict)

		return task_states, task_actions, task_target_values, task_advantages, \
			   sharing_states, sharing_actions, sharing_advantages, \
			   np.concatenate(rewards), \
			   z_states, z_actions, z_rewards
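Both discount_rewards and GAE are called here with project-specific arguments (env, per-episode states, a task index, the current value function) that are not defined in this excerpt. As generic background only: generalized advantage estimation accumulates the TD residuals delta_t = r_t + gamma * V(s_{t+1}) - V(s_t) backwards in time with weight gamma * lam. A minimal NumPy sketch with a conventional signature (not the signature used above):

import numpy as np

def gae(rewards, values, next_values, gamma=0.99, lam=0.95):
    # Generic sketch (terminal-state masking omitted); not the GAE(env, ...) helper above.
    deltas = np.asarray(rewards) + gamma * np.asarray(next_values) - np.asarray(values)
    advantages = np.zeros_like(deltas, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + gamma * lam * running
        advantages[t] = running
    return advantages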
Example #27
    def episode_finished(self, episode_number):
        action_probs = torch.stack(self.action_probs, dim=0) \
                .to(self.train_device).squeeze(-1)
        rewards = torch.stack(self.rewards, dim=0).to(self.train_device).squeeze(-1)
        values = torch.stack(self.values, dim=0).to(self.train_device).squeeze(-1) # values from the network
        done = torch.stack(self.done, dim=0).to(self.train_device).squeeze(-1)
        self.states, self.action_probs, self.rewards, self.values, self.done = [], [], [], [], []

        # Unused TD-target experiment (references an undefined self.value()):
        # for i in range(len(values)):
        #     td_target = rewards[i] + self.gamma * self.value()
        # TODO: Compute discounted rewards (use the discount_rewards function)
        discounted_rewards = discount_rewards(rewards,self.gamma)
        #discounted_rewards -= torch.mean(discounted_rewards)
        #discounted_rewards /= torch.std(discounted_rewards)


        advantage = discounted_rewards - values
        advantage -= torch.mean(advantage)
        advantage /= torch.std(advantage.detach())

        # TODO: Compute critic loss and advantages (T3)
        self.value_optimizer.zero_grad()
        self.policy_optimizer.zero_grad()

        # TODO: Compute the optimization term (T1, T3)
        #weighted_probs = -action_probs * (discounted_rewards -20) # with baseline
        weighted_probs = -action_probs * advantage.detach() # without baseline
        policy_l = weighted_probs.sum()
        critic_l = advantage.pow(2).mean()
        # TODO: Compute the gradients of loss w.r.t. network parameters (T1)
        policy_l.backward()
        critic_l.backward()

        # TODO: Update network parameters using self.optimizer and zero gradients (T1)
        self.policy_optimizer.step()
        self.value_optimizer.step()

Example #28
    def episode_finished(self, episode_number):
        action_probs = torch.stack(self.action_probs, dim=0).to(self.train_device).squeeze(-1)
        rewards = torch.stack(self.rewards, dim=0).to(self.train_device).squeeze(-1)
        self.states, self.action_probs, self.rewards = [], [], []

        # TODO: Compute discounted rewards (use the discount_rewards function)
        discounted_r = discount_rewards(rewards, self.gamma)
        #discounted_r -= torch.mean(discounted_r)  # for Task 1 c)
        #discounted_r /= torch.std(discounted_r)

        # TODO: Compute the optimization term (T1)
        #weighted_probs = action_probs * discounted_r  # REINFORCE without baseline # T1 a)+c) & T2 # from exercise 1
        weighted_probs = action_probs * (discounted_r - self.baseline)  # REINFORCE with baseline # T1 b)
        loss = torch.mean(-weighted_probs)  # negate because the optimizer minimizes; minimizing the negative objective maximizes expected return

        # TODO: Compute the gradients of loss w.r.t. network parameters (T1)
        loss.backward()  # like in policy gradient tutorial 2.2 Automatic differentiation

        # TODO: Update network parameters using self.optimizer and zero gradients (T1)
        self.optimizer.step()  # like in policy gradient tutorial 2.3 Using optimizers
        self.optimizer.zero_grad()
Example #29
    def episode_finished(self, episode_number):
        action_probs = torch.stack(self.action_probs, dim=0).to(self.train_device).squeeze(-1)
        rewards = torch.stack(self.rewards, dim=0).to(self.train_device).squeeze(-1)
        self.states, self.action_probs, self.rewards = [], [], []

        #c = 5e-4 #T2
        #self.variance = self.policy.sigma * np.exp(-c * episode_number)

        # TODO: Compute discounted rewards (use the discount_rewards function)
        G = discount_rewards(r=rewards, gamma=self.gamma)
        #normalized rewards
        G = (G - torch.mean(G)) / torch.std(G)

        # TODO: Compute the optimization term (T1)
        loss = torch.sum(-G * action_probs) #basic REINFORCE
        #loss = torch.sum(-(G-self.baseline)*action_probs) #REINFORCE with baseline

        # TODO: Compute the gradients of loss w.r.t. network parameters (T1)
        loss.backward()

        # TODO: Update network parameters using self.optimizer and zero gradients (T1)
        self.optimizer.step()
        self.optimizer.zero_grad()
Example #30
File: agent.py Project: fuksi/rl
    def episode_finished(self, episode_number):
        # Save nn every 200th eps
        if episode_number % 200 == 0 and episode_number > 0:
            torch.save(self.policy.state_dict(), 'model-nn.pt')

        # Calc discounted rewards
        all_rewards = torch.stack(self.rewards, dim=0).to(
            self.train_device).squeeze(1)
        discounted_rewards = discount_rewards(all_rewards, self.gamma)
        discounted_rewards -= torch.mean(discounted_rewards)
        discounted_rewards /= torch.std(discounted_rewards)

        # Stack prop_ups, fake_labels, measure losses
        all_actions = torch.stack(self.prop_ups).float().to(
            self.train_device).squeeze(1)
        all_labels = torch.tensor(
            self.fake_labels).float().to(self.train_device)
        losses = self.loss(all_actions, all_labels)
        losses *= discounted_rewards
        loss = torch.mean(losses)

        # Reset
        self.reset()

        # Compute grad
        loss.backward(torch.tensor(1.0/self.batch_size).to(self.train_device))

        # Output loss and rewards every now and then
        reward_sum = sum(all_rewards)
        if episode_number % 10 == 1:
            print(f'Episode: {episode_number}, Loss: {loss}. Rewards: {reward_sum}')

        # Update policy depends on batch size
        if episode_number % self.batch_size == 0 and episode_number > 0:
            self.optimizer.step()
            self.optimizer.zero_grad()