Example #1
    def run(self):
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [], [], [], [], []
        for n in range(self.nsteps):

            obs = np.float32(self.obs / 255.)

            policy = self.step_actor.predict(obs)

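            # sample one action per environment from the policy's categorical distribution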
            actions = []
            for i in range(policy.shape[0]):
                action = np.random.choice(self.action_size, 1, p=policy[i])
                actions.append(action)
            actions = np.array(actions)

            values = self.step_critic.predict(obs)
            values = values[:, 0]

            mb_obs.append(np.copy(self.obs))
            mb_actions.append(actions)
            mb_values.append(values)
            mb_dones.append(self.dones)
            obs, rewards, dones, _ = self.env.step(actions)

            self.dones = dones
            for n, done in enumerate(dones):
                if done:
                    self.obs[n] = self.obs[n] * 0
            self.update_obs(obs)

            mb_rewards.append(rewards)

        mb_dones.append(self.dones)
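        # batch of steps to batch of rollouts (time-major -> env-major)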
        mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(
            self.batch_ob_shape)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
        mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
        mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
        mb_dones = np.asarray(mb_dones, dtype=bool).swapaxes(1, 0)

        mb_dones = mb_dones[:, 1:]

        last_values = self.step_critic.predict(np.float32(self.obs / 255.)).tolist()  # normalize as in the rollout loop

        # discount/bootstrap off value fn
        for n, (rewards, dones,
                value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
            rewards = rewards.tolist()
            dones = dones.tolist()
            if dones[-1] == 0:
                rewards = discount_with_dones(rewards + value, dones + [0],
                                              self.gamma)[:-1]
            else:
                rewards = discount_with_dones(rewards, dones, self.gamma)
            mb_rewards[n] = rewards

        mb_rewards = mb_rewards.flatten()
        mb_actions = mb_actions.flatten()
        mb_values = mb_values.flatten()

        return mb_obs, mb_actions, mb_rewards, mb_values
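Every example here relies on a discount_with_dones helper that turns per-step rewards into n-step discounted returns, bootstrapping from the critic's value of the last observation unless the rollout ended in a terminal state. A minimal sketch of that helper, consistent with the OpenAI baselines A2C utility these snippets appear to follow (an illustration, not necessarily the exact function each project imports):

def discount_with_dones(rewards, dones, gamma):
    # Walk the trajectory backwards; a done flag resets the running return.
    discounted, running = [], 0.0
    for reward, done in zip(rewards[::-1], dones[::-1]):
        running = reward + gamma * running * (1.0 - done)
        discounted.append(running)
    return discounted[::-1]

# e.g. discount_with_dones([1.0, 0.0, 1.0], [0, 0, 1], 0.99) -> [1.9801, 0.99, 1.0]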
Example #2
    def run_nsteps(self):
        b_observations, b_rewards, b_actions, b_values, b_dones = [], [], [], [], []
        b_states = self.states
        for m in range(self.nsteps):
            time.sleep(0.001)
            # action, ap, value, states, _ = self.model.step([self.obs], [self.done], self.states)
            # action = np.random.choice(np.arange(self.n_actions), p=ap[0])
            # reward = env.act(self.action_set[action])
            a_dist, value, states = self.model.step([self.obs], [self.done],
                                                    self.states)
            action = np.random.choice(np.arange(self.n_actions), p=a_dist[0])
            reward = env.act(self.action_set[action])
            reward += 0.1
            # print('%s %s -- %s' % (ap[0], action, reward))
            if env.game_over():
                done = 1
                obs = list(env.getGameState())
                if abs(self.obs[0] - self.obs[3]) <= 24 or abs(self.obs[0] - self.obs[4]) <= 24:
                    reward = -3.  # penalize it less if agent hits tunnel edges
            else:
                done = 0
                obs = list(env.getGameState())
            if reward == 1.:
                print(reward, self.obs)
                print(reward, obs)
            self.states = states
            self.done = done
            b_dones.append(done)
            b_observations.append(np.copy(self.obs))
            b_actions.append(action)
            b_rewards.append(reward)
            b_values.append(value)

            self.obs = obs  # obs = next_obs
            if done:
                break

        self.total_return += reward

        # convert lists to numpy arrays and flatten arrays
        b_observations = np.asarray(b_observations, dtype=np.float32)
        b_rewards = np.asarray(b_rewards, dtype=np.float32).flatten()  # float32: rewards carry the +0.1 shaping bonus and the -3. penalty
        b_actions = np.asarray(b_actions, dtype=np.uint8).flatten()
        b_values = np.asarray(b_values, dtype=np.float32).flatten()
        b_dones = np.asarray(b_dones, dtype=np.int8).flatten()
        next_value = self.model.value([self.obs], [self.done], self.states)
        if b_dones[-1] == 0:
            discounted_rewards = discount_with_dones(
                b_rewards.tolist() + next_value.tolist(),
                b_dones.tolist() + [0], self.discount)[:-1]
        else:
            discounted_rewards = discount_with_dones(b_rewards, b_dones,
                                                     self.discount)
        return b_observations, discounted_rewards, b_actions, b_values, b_dones, b_states
Example #3
 def run(self):
     mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_raw_rewards = [], [], [], [], [], []
     mb_states = self.states
     for n in range(self.nsteps):
         actions, values, entropy, states, _ = self.model.step(
             self.obs, self.states, self.dones)
         mb_obs.append(np.copy(self.obs))
         mb_actions.append(actions)
         mb_values.append(values)
         mb_dones.append(self.dones)
         obs_all, raw_rewards, dones, _ = self.env.step(actions)
         obs = [obs_index['image'] for obs_index in obs_all]
         obs = np.asarray(obs)
         #rewards = raw_rewards + entropy * self.entropy_coef
         rewards = raw_rewards
         self.states = states
         self.dones = dones
         for n, done in enumerate(dones):
             if done:
                 self.obs[n] = self.obs[n] * 0
         self.obs = obs
         mb_rewards.append(rewards)
         mb_raw_rewards.append(raw_rewards)
     mb_dones.append(self.dones)
     # batch of steps to batch of rollouts
     mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(
         self.batch_ob_shape)
     mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
     mb_raw_rewards = np.asarray(mb_raw_rewards,
                                 dtype=np.float32).swapaxes(1, 0)
     mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
     mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
     mb_dones = np.asarray(mb_dones, dtype=bool).swapaxes(1, 0)
     mb_masks = mb_dones[:, :-1]
     mb_dones = mb_dones[:, 1:]
     last_values = self.model.value(self.obs, self.states,
                                    self.dones).tolist()
     # discount/bootstrap off value fn
     for n, (rewards, dones,
             value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
         rewards = rewards.tolist()
         dones = dones.tolist()
         if dones[-1] == 0:
             rewards = discount_with_dones(rewards + [value], dones + [0],
                                           self.gamma)[:-1]
         else:
             rewards = discount_with_dones(rewards, dones, self.gamma)
         mb_rewards[n] = rewards
     mb_rewards = mb_rewards.flatten()
     mb_raw_rewards = mb_raw_rewards.flatten()
     mb_actions = mb_actions.flatten()
     mb_values = mb_values.flatten()
     mb_masks = mb_masks.flatten()
     return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values, mb_raw_rewards
Example #4
 def run(self):
     mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
     mb_states = self.states
     for n in range(self.nsteps):
         actions, values, states = self.model.step(self.obs, self.states,
                                                   self.dones)
         mb_obs.append(np.copy(self.obs))
         mb_actions.append(actions)
         mb_values.append(values)
         mb_dones.append(self.dones)
         obs, rewards, dones, _ = self.env.step(actions)
         self.states = states
         self.dones = dones
         for n, done in enumerate(dones):
             if done:
                 self.obs[n] = self.obs[n] * 0
         self.update_obs(obs)
         mb_rewards.append(rewards)
     mb_dones.append(self.dones)
     #batch of steps to batch of rollouts
     if self.from_pixels:
         # (nstep, nenv, 100, 150, nstack) before below
         mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(
             self.batch_ob_shape)
     else:
         mb_obs = np.asarray(mb_obs, dtype=np.float32).swapaxes(
             1, 0).reshape(self.batch_ob_shape)
     mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
     mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
     mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
     mb_dones = np.asarray(mb_dones, dtype=bool).swapaxes(1, 0)
     mb_masks = mb_dones[:, :-1]
     mb_dones = mb_dones[:, 1:]
     last_values = self.model.value(self.obs, self.states,
                                    self.dones).tolist()
     #discount/bootstrap off value fn
     for n, (rewards, dones,
             value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
         rewards = rewards.tolist()
         dones = dones.tolist()
         if dones[-1] == 0:
             rewards = discount_with_dones(rewards + [value], dones + [0],
                                           self.gamma)[:-1]
         else:
             rewards = discount_with_dones(rewards, dones, self.gamma)
         mb_rewards[n] = rewards
     mb_rewards = mb_rewards.flatten()
     mb_actions = mb_actions.flatten()
     mb_values = mb_values.flatten()
     mb_masks = mb_masks.flatten()
     return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values
Example #5
File: a2c.py  Project: Jventajas/A2C
def collect_rollout(env, agent, initial_obs, max_t, gamma=0.99):
    roll_obs, roll_rew, roll_act, roll_vals, roll_dones = [], [], [], [], []
    obs = initial_obs
    dones = [False for _ in range(env.num_env)]

    for n in range(max_t):
        actions, values = agent(obs)
        roll_obs.append(obs)
        roll_act.append(actions)
        roll_vals.append(values)
        roll_dones.append(dones)
        obs, rewards, dones, _ = env.step(actions)
        roll_rew.append(rewards)

    roll_obs = np.asarray(roll_obs, dtype=np.float32).swapaxes(1, 0).reshape((env.num_env * max_t,) + env.obs_shape)
    roll_rew = np.asarray(roll_rew, dtype=np.float32).swapaxes(1, 0)
    roll_act = np.asarray(roll_act, dtype=np.int32).swapaxes(1, 0)
    roll_vals = np.asarray(roll_vals, dtype=np.float32).swapaxes(1, 0)
    roll_dones = np.asarray(roll_dones, dtype=bool).swapaxes(1, 0)  # step-major -> env-major so the bootstrap loop below sees per-env done flags
    _, last_values = agent(obs)

    for n, (rewards, ep_dones, value) in enumerate(zip(roll_rew, roll_dones, last_values)):
        rewards = rewards.tolist()
        ep_dones = ep_dones.tolist()
        if ep_dones[-1] == 0:
            rewards = discount_with_dones(rewards + [value], ep_dones + [0], gamma)[:-1]
        else:
            rewards = discount_with_dones(rewards, ep_dones, gamma)
        roll_rew[n] = rewards


    roll_rew = roll_rew.flatten()
    roll_act = roll_act.flatten()
    roll_vals = roll_vals.flatten()


    return obs, roll_obs, roll_rew, roll_act, roll_vals
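As a small, self-contained sketch of how the flattened outputs of collect_rollout are typically consumed in A2C (the numbers below are made up, and the returns-minus-values advantage step is standard A2C usage rather than part of the snippet above):

import numpy as np

returns = np.array([1.98, 0.99, 1.00, 0.50], dtype=np.float32)  # discounted returns (roll_rew)
values = np.array([1.50, 0.80, 0.90, 0.40], dtype=np.float32)   # critic estimates (roll_vals)
advantages = returns - values  # weights the policy-gradient term of the actor loss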
Example #6
 def learn(self):
     if not self.args.no_sil:
         sil_model = sil_module(self.net, self.args, self.optimizer)
     num_updates = self.args.total_frames // (self.args.num_processes *
                                              self.args.nsteps)
     # get the reward to calculate other information
     episode_rewards = torch.zeros([self.args.num_processes, 1])
     final_rewards = torch.zeros([self.args.num_processes, 1])
     # start to update
     for update in range(num_updates):
         mb_obs, mb_rewards, mb_actions, mb_dones = [], [], [], []
         for step in range(self.args.nsteps):
             with torch.no_grad():
                 input_tensor = self._get_tensors(self.obs)
                 _, pi = self.net(input_tensor)
             # select actions
             actions = select_actions(pi)
             cpu_actions = actions.squeeze(1).cpu().numpy()
             # start to store the information
             mb_obs.append(np.copy(self.obs))
             mb_actions.append(cpu_actions)
             mb_dones.append(self.dones)
             # step
             obs, rewards, dones, _ = self.envs.step(cpu_actions)
             # process rewards...
             raw_rewards = copy.deepcopy(rewards)
             rewards = np.sign(rewards)
             # start to store the rewards
             self.dones = dones
             if not self.args.no_sil:
                 sil_model.step(input_tensor.detach().cpu().numpy(),
                                cpu_actions, raw_rewards, dones)
             mb_rewards.append(rewards)
             for n, done in enumerate(dones):
                 if done:
                     self.obs[n] = self.obs[n] * 0
             self.obs = obs
             raw_rewards = torch.from_numpy(
                 np.expand_dims(np.stack(raw_rewards), 1)).float()
             episode_rewards += raw_rewards
             # get the masks
             masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                        for done_ in dones])
             final_rewards *= masks
             final_rewards += (1 - masks) * episode_rewards
             episode_rewards *= masks
             # update the obs
         mb_dones.append(self.dones)
         # process the rollouts
         mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(
             self.batch_ob_shape)
         mb_rewards = np.asarray(mb_rewards,
                                 dtype=np.float32).swapaxes(1, 0)
         mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
         mb_dones = np.asarray(mb_dones, dtype=bool).swapaxes(1, 0)
         mb_masks = mb_dones[:, :-1]
         mb_dones = mb_dones[:, 1:]
         with torch.no_grad():
             input_tensor = self._get_tensors(self.obs)
             last_values, _ = self.net(input_tensor)
         # compute returns
         for n, (rewards, dones, value) in enumerate(
                 zip(mb_rewards, mb_dones,
                     last_values.detach().cpu().numpy().squeeze())):
             rewards = rewards.tolist()
             dones = dones.tolist()
             if dones[-1] == 0:
                 rewards = discount_with_dones(rewards + [value],
                                               dones + [0],
                                               self.args.gamma)[:-1]
             else:
                 rewards = discount_with_dones(rewards, dones,
                                               self.args.gamma)
             mb_rewards[n] = rewards
         mb_rewards = mb_rewards.flatten()
         mb_actions = mb_actions.flatten()
         # start to update network
         vl, al, ent = self._update_network(mb_obs, mb_rewards, mb_actions)
         # start to update the sil_module
         if not self.args.no_sil:
             mean_adv, num_samples = sil_model.train_sil_model()
         if update % self.args.log_interval == 0:
             if not self.args.no_sil:
                 print('[{}] Update: {}/{}, Frames: {}, Rewards: {:.2f}, VL: {:.3f}, PL: {:.3f},' \
                         'Ent: {:.2f}, Min: {}, Max:{}, BR:{}, E:{}, VS:{}, S:{}'.format(\
                         datetime.now(), update, num_updates, (update+1)*(self.args.num_processes * self.args.nsteps),\
                         final_rewards.mean(), vl, al, ent, final_rewards.min(), final_rewards.max(), sil_model.get_best_reward(), \
                         sil_model.num_episodes(), num_samples, sil_model.num_steps()))
             else:
                 print('[{}] Update: {}/{}, Frames: {}, Rewards: {:.2f}, VL: {:.3f}, PL: {:.3f},' \
                         'Ent: {:.2f}, Min: {}, Max:{}'.format(\
                         datetime.now(), update, num_updates, (update+1)*(self.args.num_processes * self.args.nsteps),\
                         final_rewards.mean(), vl, al, ent, final_rewards.min(), final_rewards.max()))
             torch.save(self.net.state_dict(), self.model_path + 'model.pt')
Example #7
    def run(self):
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_rawrewards = [],[],[],[],[],[]
        mb_states = self.states
        for n in range(self.nsteps):
            actions, pi, values, states, _ = self.model.step(
                self.obs, self.states)  # , self.dones) ?
            mb_obs.append(np.copy(self.obs))
            mb_actions.append(actions)
            mb_values.append(values)
            mb_dones.append(self.dones)
            obs, rewards, dones, _ = self.env.step(actions)
            obs = normalize_obs(obs)
            self.logger.debug('Observations: %s' % obs)

            # render only every i-th episode
            if self.show_interval != 0:
                if (self.ep_idx[0] % self.show_interval) == 0:
                    self.env.render()

            # TODO: use the helpers already implemented in run_ple_utils
            for i in range(self.nenv):
                self.eplength[i] += 1
                self.epreturn[i] += rewards[i]
                self.reward_window[i].append(rewards[i])

            # Check for terminal states in every env
            for i, done in enumerate(dones):  # i -> environment ID
                if done:
                    self.ep_idx[i] += 1
                    self.obs[i] = self.obs[i] * 0

                    # update tensorboard summary
                    if self.summary_writer is not None:
                        summary = tf.Summary()
                        summary.value.add(
                            tag='envs/environment%s/episode_length' % i,
                            simple_value=self.eplength[i])
                        summary.value.add(
                            tag='envs/environment%s/episode_reward' % i,
                            simple_value=self.epreturn[i])
                        self.summary_writer.add_summary(
                            summary, self.ep_idx[i])  #self.global_step.eval())
                        self.summary_writer.flush()
                    # self.retbuffer.append(self.epreturn[i])
                    if self.epreturn[i] > self.return_threshold:
                        self.return_threshold = self.epreturn[i]
                        self.logger.info('Save model at max reward %s' %
                                         self.return_threshold)
                        self.model.save('inter_model')
                    self.eplength[i] = 0
                    self.epreturn[i] = 0

            # # Is not necessary, as the environment is continuous now!
            # # Reset RNN state vector to 0 if previous sample is a terminating one.
            # # As no history should be used in rnn training then.
            # if states:
            #     env_was_done = False
            #     for i, done in enumerate(self.dones):
            #         if done and not env_was_done:
            #             env_was_done = True
            #             c_new = states[0]
            #             h_new = states[1]
            #             c_new[i] = np.zeros_like(c_new[i])
            #             h_new[i] = np.zeros_like(h_new[i])
            #         elif done:
            #             c_new[i] = np.zeros_like(c_new[i])
            #             h_new[i] = np.zeros_like(h_new[i])
            #     if env_was_done:
            #         states = tf.contrib.rnn.LSTMStateTuple(c_new, h_new)
            #         # print(states)

            self.states = states
            self.dones = dones
            self.obs = obs
            mb_rewards.append(rewards)
        mb_dones.append(self.dones)
        #batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=np.float32).swapaxes(1, 0).reshape(
            self.batch_ob_shape)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
        mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
        mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
        mb_dones = np.asarray(mb_dones, dtype=bool).swapaxes(1, 0)
        # mb_masks = mb_dones[:, :-1] ?
        mb_rawrewards = np.copy(mb_rewards)
        mb_dones = mb_dones[:, 1:]
        last_values = self.model.value(self.obs, self.states).tolist()

        # discount/bootstrap off value fn
        for n, (rewards, dones,
                value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
            rewards = rewards.tolist()
            dones = dones.tolist()
            if dones[-1] == 0:
                rewards = discount_with_dones(rewards + [value], dones + [0],
                                              self.gamma)[:-1]
            else:
                rewards = discount_with_dones(rewards, dones, self.gamma)
            self.logger.debug('Discounted rewards: %s' % rewards)
            mb_rewards[n] = rewards
        mb_rewards = mb_rewards.flatten()
        mb_actions = mb_actions.flatten()
        mb_values = mb_values.flatten()

        self.logger.debug('Actions: %s' % mb_actions)
        self.logger.debug('Q values: %s' % mb_values)
        self.logger.debug('Observations: %s' % mb_obs)

        return mb_obs, mb_states, mb_rewards, mb_actions, mb_values, self.reward_window, mb_rawrewards  # self.avg_return_n_episodes
Example #8
 def learn(self):
     num_updates = self.args.total_frames // (self.args.num_workers * self.args.nsteps)
     # get the reward to calculate other information
     episode_rewards = np.zeros((self.args.num_workers, ), dtype=np.float32)
     final_rewards = np.zeros((self.args.num_workers, ), dtype=np.float32)
     # start to update
     for update in range(num_updates):
         if self.args.lr_decay:
             self._adjust_learning_rate(update, num_updates)
         mb_obs, mb_rewards_ex, mb_actions, mb_dones, mb_obs_, mb_v_ex, mb_v_mix = [], [], [], [], [], [], []
         for step in range(self.args.nsteps):
             with torch.no_grad():
                 input_tensor = self._get_tensors(self.obs)
                 v_mix, pi = self.net(input_tensor)
                 _, v_ex = self.intrinsic_net(input_tensor)
             # select actions
             actions = select_actions(pi)
             cpu_actions = actions.squeeze(1).cpu().numpy()
             # start to store the information
             mb_obs.append(np.copy(self.obs))
             mb_actions.append(cpu_actions)
             mb_dones.append(self.dones)
             mb_v_ex.append(v_ex.detach().cpu().numpy().squeeze())
             mb_v_mix.append(v_mix.detach().cpu().numpy().squeeze())
             # step
             obs_, rewards, dones, _ = self.envs.step(cpu_actions)
             # store the observation next
             mb_obs_.append(np.copy(obs_))
             # start to store the rewards
             self.dones = dones
             mb_rewards_ex.append(rewards)
             for n, done in enumerate(dones):
                 if done:
                     self.obs[n] = self.obs[n]*0
             self.obs = obs_
             episode_rewards += rewards
             # get the masks
             masks = np.array([0.0 if done else 1.0 for done in dones], dtype=np.float32)
             final_rewards *= masks
             final_rewards += (1 - masks) * episode_rewards
             episode_rewards *= masks
             # update the obs
         mb_dones.append(self.dones)
         # process the rollouts
         mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape)
         mb_obs_ = np.asarray(mb_obs_, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape)
         """
         mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0)
         mb_obs_ = np.asarray(mb_obs_, dtype=np.uint8).swapaxes(1, 0)
         """
         # calculate the intrinsic rewards and make sure the dimensional is right
         mb_rewards_in = self._compute_intrinsic_rewards(mb_obs, mb_obs_)
         mb_rewards_in = mb_rewards_in.reshape((self.args.num_workers, self.args.nsteps))
         # --- next
         mb_rewards_ex = np.asarray(mb_rewards_ex, dtype=np.float32).swapaxes(1, 0)
         # calculate the mix reward
         mb_rewards_mix = self.args.r_ext_coef * mb_rewards_ex + self.args.r_in_coef * mb_rewards_in
         mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
         mb_dones = np.asarray(mb_dones, dtype=bool).swapaxes(1, 0)
         mb_v_ex = np.asarray(mb_v_ex, dtype=np.float32).swapaxes(1, 0)
         mb_v_mix = np.asarray(mb_v_mix, dtype=np.float32).swapaxes(1, 0)
         # masks
         mb_masks = mb_dones[:, :-1]
         mb_dones = mb_dones[:, 1:]
         # calculate the last value
         with torch.no_grad():
             input_tensor = self._get_tensors(self.obs)
             last_values_mix, _ = self.net(input_tensor)
             last_values_mix = last_values_mix.detach().cpu().numpy().squeeze()
             # then last value ex
             _, last_values_ex = self.intrinsic_net(input_tensor)
             last_values_ex = last_values_ex.detach().cpu().numpy().squeeze()
         # get the returns ex and in
         mb_returns_ex, mb_returns_mix = np.zeros(mb_rewards_ex.shape), np.zeros(mb_rewards_in.shape)
         # compute returns
         for n, (rewards_ex, rewards_mix, dones, value_mix, value_ex) in enumerate(zip(mb_rewards_ex, mb_rewards_mix, mb_dones, last_values_mix, last_values_ex)):
             rewards_ex = rewards_ex.tolist()
             rewards_mix = rewards_mix.tolist()
             dones = dones.tolist()
             if dones[-1] == 0:
                 returns_ex = discount_with_dones(rewards_ex+[value_ex], dones+[0], self.args.gamma)[:-1]
                 returns_mix = discount_with_dones(rewards_mix+[value_mix], dones+[0], self.args.gamma)[:-1]
             else:
                 returns_ex = discount_with_dones(rewards_ex, dones, self.args.gamma)
                 returns_mix = discount_with_dones(rewards_mix, dones, self.args.gamma)
             mb_returns_ex[n] = returns_ex
             mb_returns_mix[n] = returns_mix
         # flatten stuffs
         mb_rewards_ex = mb_rewards_ex.flatten()
         mb_rewards_in = mb_rewards_in.flatten()
         mb_returns_ex = mb_returns_ex.flatten()
         mb_returns_mix = mb_returns_mix.flatten()
         mb_actions = mb_actions.flatten()
         mb_v_ex = mb_v_ex.flatten()
         mb_v_mix = mb_v_mix.flatten()
         mb_dones = mb_dones.flatten()
         mb_masks = mb_masks.flatten()
         # before the training calculate the matrix
         """
         here - we calculate the coefficient matrix
         """
         dis_v_mix_last = np.zeros([mb_obs.shape[0]], np.float32)
         coef_mat = np.zeros([mb_obs.shape[0], mb_obs.shape[0]], np.float32)
         for i in range(mb_obs.shape[0]):
             dis_v_mix_last[i] = self.args.gamma ** (self.args.nsteps - i % self.args.nsteps) * last_values_mix[i // self.args.nsteps]
             coef = 1.0
             for j in range(i, mb_obs.shape[0]):
                 if j > i and j % self.args.nsteps == 0:
                     break
                 coef_mat[i][j] = coef
                 coef *= self.args.gamma
                 if mb_dones[j]:
                     dis_v_mix_last[i] = 0
                     break
         # start to update network
         vl, al, ent = self._update_network(mb_obs, mb_obs_, mb_masks, mb_actions, mb_rewards_ex, mb_returns_ex, mb_v_ex, mb_v_mix, \
                                             dis_v_mix_last, coef_mat)
         if update % self.args.log_interval == 0:
             print('[{}] Update: {}/{}, Frames: {}, Rewards: {:.1f}, VL: {:.3f}, PL: {:.3f}, Ent: {:.2f}, Min: {}, Max:{}, R_in: {:.3f}'.format(\
                 datetime.now(), update, num_updates, (update+1)*(self.args.num_workers * self.args.nsteps),\
                 final_rewards.mean(), vl, al, ent, final_rewards.min(), final_rewards.max(), np.mean(mb_rewards_in)))
             torch.save(self.net.state_dict(), self.model_path + 'model.pt')
Example #9
    def run(self):
        mb_obs, mb_rewards, mb_rewards_square, mb_actions, mb_values, mb_moments, mb_dones = [],[],[],[],[],[],[]
        mb_states = self.states
        epinfos = []
        for n in range(self.nsteps):
            actions, values, moments, states, _ = self.model.step(
                self.obs, self.states, self.dones)
            mb_obs.append(np.copy(self.obs))
            mb_actions.append(actions)
            mb_values.append(values)
            mb_moments.append(moments)
            mb_dones.append(self.dones)
            obs, rewards, dones, infos = self.env.step(actions)
            for info in infos:
                maybeepinfo = info.get('episode')
                if maybeepinfo: epinfos.append(maybeepinfo)
            self.states = states
            self.dones = dones
            for n, done in enumerate(dones):
                if done:
                    self.obs[n] = self.obs[n] * 0
                    self.counters_fixed.append(self.counters[n])
                    self.counters[n] = 0
                else:
                    self.counters[n] += rewards[n]
            self.obs = obs
            rewards = np.sign(rewards)
            mb_rewards.append(rewards)
            mb_rewards_square.append(rewards)  # NOTE: possibly a bug -- same clipped rewards appended for the squared-return buffer
        mb_dones.append(self.dones)
        #batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(
            self.batch_ob_shape)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
        mb_rewards_square = np.asarray(mb_rewards_square,
                                       dtype=np.float32).swapaxes(1, 0)

        mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
        mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
        mb_moments = np.asarray(mb_moments, dtype=np.float32).swapaxes(1, 0)
        mb_dones = np.asarray(mb_dones, dtype=bool).swapaxes(1, 0)
        mb_masks = mb_dones[:, :-1]
        mb_dones = mb_dones[:, 1:]
        values_temp, moments_temp = self.model.value(self.obs, self.states,
                                                     self.dones)
        last_values = values_temp.tolist()
        last_moments = moments_temp.tolist()
        #discount/bootstrap off value fn
        for n, (rewards, dones, value, moment) in enumerate(
                zip(mb_rewards, mb_dones, last_values, last_moments)):
            rewards_square = rewards.copy()
            rewards_square = rewards_square.tolist()
            rewards = rewards.tolist()
            dones = dones.tolist()
            if dones[-1] == 0:
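                # sigma estimates the return standard deviation from the second
                # moment; the bootstrap value is sampled from N(value, sigma)
                # rather than using the point estimate directly.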
                sigma = np.sqrt(np.maximum(moment - value**2, 0))
                prob_value = np.random.normal(loc=value, scale=sigma)
                # prob_value = value + sigma
                rewards = discount_with_dones(rewards + [prob_value],
                                              dones + [0], self.gamma)[:-1]
                rewards_square = discount_moments_with_dones(rewards_square,
                                                             dones,
                                                             self.gamma,
                                                             flag=True,
                                                             value=value,
                                                             moment=moment)
            else:
                rewards = discount_with_dones(rewards, dones, self.gamma)
                rewards_square = discount_moments_with_dones(
                    rewards_square, dones, self.gamma)

            mb_rewards[n] = rewards
            mb_rewards_square[n] = rewards_square
        mb_rewards = mb_rewards.flatten()
        mb_rewards_square = mb_rewards_square.flatten()
        mb_actions = mb_actions.flatten()
        mb_values = mb_values.flatten()
        mb_moments = mb_moments.flatten()
        mb_masks = mb_masks.flatten()
        return mb_obs, mb_states, mb_rewards, mb_rewards_square, mb_masks, mb_actions, mb_values, mb_moments, epinfos
Example #10
    def run(self):
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [], [], [], [], []
        mb_actions_n = []
        mb_raw_rewards = []
        mb_states = self.states
        int_reward = np.zeros(self.nenvs)
        if self.first_flag == 1:
            self.actions, self.values, self.act_pred, _, _ = self.model.step(
                self.obs, self.states, self.dones)
            self.first_flag = 0
            self.last_actpred = np.copy(self.act_pred)
            print('first_flag:', self.first_flag)

        for n in range(self.nsteps):
            mb_obs.append(np.copy(self.obs))
            mb_actions.append(self.actions)
            mb_values.append(self.values)
            mb_dones.append(self.dones)
            obs_all, raw_rewards, dones, _ = self.env.step(self.actions)
            obs = [obs_index['image'] for obs_index in obs_all]
            obs = np.asarray(obs)

            rewards = np.array(raw_rewards, dtype=np.float32)
            self.dones = dones
            # added by lilijuan on 2018-09-25
            last_obs = np.copy(self.obs)
            last_actions = np.copy(self.actions)

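            # small intrinsic bonus (0.001) whenever the executed action differs
            # from the action predicted at the previous step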
            for i in range(self.nenvs):
                int_reward[i] = self.actions[i] != self.last_actpred[i]
            int_reward = np.array(int_reward * 0.001, dtype=np.float32)
            rewards += int_reward
            self.last_actpred = np.copy(self.act_pred)
            for n, done in enumerate(dones):
                if done:
                    self.obs[n] = self.obs[n] * 0
            self.obs = obs
            mb_rewards.append(rewards)
            mb_raw_rewards.append(raw_rewards)
            self.actions, self.values, self.act_pred, _, _ = self.model.step(
                self.obs, self.states, self.dones)
            mb_actions_n.append(self.actions)
            # added by lilijuan on 2018-09-25: store (obs, action, action_n, reward, done)
            # self.model.sil.step(last_obs, last_actions,
            #                    self.actions, raw_rewards, dones)
            # self.model.sil.step(last_obs, last_actions, self.actions,
            #                    raw_rewards, rewards, dones)
        mb_dones.append(self.dones)
        # batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(
            self.batch_ob_shape)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
        mb_raw_rewards = np.asarray(mb_raw_rewards,
                                    dtype=np.float32).swapaxes(1, 0)
        mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
        mb_actions_n = np.asarray(mb_actions_n, dtype=np.int32).swapaxes(1, 0)
        mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
        mb_dones = np.asarray(mb_dones, dtype=bool).swapaxes(1, 0)
        mb_masks = mb_dones[:, :-1]
        mb_dones = mb_dones[:, 1:]
        # last_values = self.model.value(
        #    self.obs, self.states, self.dones).tolist()
        last_values = self.values.tolist()
        # discount/bootstrap off value fn
        for n, (rewards, dones,
                value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
            rewards = rewards.tolist()
            dones = dones.tolist()
            if dones[-1] == 0:
                rewards = discount_with_dones(rewards + [value], dones + [0],
                                              self.gamma)[:-1]
            else:
                rewards = discount_with_dones(rewards, dones, self.gamma)
            mb_rewards[n] = rewards
        mb_rewards = mb_rewards.flatten()
        mb_raw_rewards = mb_raw_rewards.flatten()
        mb_actions = mb_actions.flatten()
        mb_actions_n = mb_actions_n.flatten()
        mb_values = mb_values.flatten()
        mb_masks = mb_masks.flatten()
        return mb_obs, mb_states, mb_rewards, mb_raw_rewards, mb_masks, mb_actions, mb_values, mb_actions_n, int_reward
Example #11
    def run(self):
        mb_obs, mb_maps, mb_coords, mb_states, mb_rewards, mb_actions, mb_values, mb_dones = [], [], [], [], [], [], [], []
        for n in range(self.nsteps):
            #actions, values, states = self.model.step(self.obs, self.states, self.dones)
            actions, values, states, maps = self.model.step(self.sf.obs, maps=self.maps, coords=self.sf.coords)
            mb_obs.append(np.copy(self.sf.obs))
            mb_coords.append(np.copy(self.sf.coords))
            mb_actions.append(actions)
            mb_values.append(values)
            mb_dones.append(self.dones)
            obs, rewards, dones, _ = self.env.step(actions)
            self.states = states
            self.dones = dones
            # the outer loop is per-env done,
            # the inner loop is per-player done
            for i, done in enumerate(dones):
                if done[0]:
                    self.sf.obs[i, :] = self.sf.obs[i, :]*0
                    self.sf.coords[i, :] = self.sf.coords[i, :]*0
                    if self.maps != []:
                        self.maps[i] = self.maps[i]*0
            self.sf.update_obs(obs)
            mb_states.append(np.copy(self.states))
            mb_maps.append(np.copy(maps))
            mb_rewards.append(rewards)
        mb_dones.append(self.dones)
        #batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape)
        mb_states = np.asarray(mb_states, dtype=np.float32).swapaxes(1, 0).reshape([self.nenv*self.nsteps, -1])
        mb_maps = np.asarray(mb_maps, dtype=np.float32).swapaxes(1, 0).reshape([self.nenv*self.nsteps] + self.map_size)
        mb_coords = np.asarray(mb_coords, dtype=np.uint8).swapaxes(1, 0).reshape([self.nenv*self.nsteps, -1, 2])
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0).swapaxes(2, 1)
        mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
        mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0).swapaxes(2, 1)
        mb_dones = np.asarray(mb_dones, dtype=bool).swapaxes(1, 0).swapaxes(2, 1)
        mb_masks = mb_dones[:, :, :-1]
        mb_dones = mb_dones[:, :, 1:]
        last_values = self.model.value(self.sf.obs, maps=self.maps, coords=self.sf.coords)
        #discount/bootstrap off value fn
        for i, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
            rewards = rewards.tolist()
            dones = dones.tolist()
            for j, (r, d, v) in enumerate(zip(rewards, dones, value)):
                if d[-1] == 0:
                    r = discount_with_dones(np.asarray(r+[v]), d+[0], self.gamma)[:-1]
                else:
                    r = discount_with_dones(np.asarray(r), d, self.gamma)
                mb_rewards[i, j] = r
            """
            if dones[-1] == 0:
                rewards = discount_with_dones(np.asarray(rewards+[value]), dones+[0], self.gamma)[:-1]
            else:
                rewards = discount_with_dones(np.asarray(rewards), dones, self.gamma)
            mb_rewards[n] = rewards
            """

        def _flatten(arr):
            return arr.reshape(-1)

        mb_rewards = _flatten(mb_rewards.swapaxes(2,1))
        mb_values = _flatten(mb_values.swapaxes(2,1))
        mb_masks = mb_masks.swapaxes(2,1)
        mb_masks = mb_masks.reshape(mb_masks.shape[0]*mb_masks.shape[1], -1)
        mb_actions = _flatten(mb_actions)

        return mb_obs, mb_maps, mb_coords, mb_states, mb_rewards, mb_masks, mb_actions, mb_values
Example #12
 def learn(self):
     num_updates = self.args.total_frames // (self.args.num_workers *
                                              self.args.nsteps)
     # get the reward to calculate other information
     episode_rewards = np.zeros((self.args.num_workers, ), dtype=np.float32)
     final_rewards = np.zeros((self.args.num_workers, ), dtype=np.float32)
     # start to update
     for update in range(num_updates):
         mb_obs, mb_rewards, mb_actions, mb_dones = [], [], [], []
         for step in range(self.args.nsteps):
             with torch.no_grad():
                 input_tensor = self._get_tensors(self.obs)
                 _, pi = self.net(input_tensor)
             # select actions
             actions = select_actions(pi)
             cpu_actions = actions.squeeze(1).cpu().numpy()
             # start to store the information
             mb_obs.append(np.copy(self.obs))
             mb_actions.append(cpu_actions)
             mb_dones.append(self.dones)
             # step
             obs, rewards, dones, _ = self.envs.step(cpu_actions)
             # start to store the rewards
             self.dones = dones
             mb_rewards.append(rewards)
             for n, done in enumerate(dones):
                 if done:
                     self.obs[n] = self.obs[n] * 0
             self.obs = obs
             episode_rewards += rewards
             # get the masks
             masks = np.array([0.0 if done else 1.0 for done in dones],
                              dtype=np.float32)
             final_rewards *= masks
             final_rewards += (1 - masks) * episode_rewards
             episode_rewards *= masks
             # update the obs
         mb_dones.append(self.dones)
         # process the rollouts
         mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(
             self.batch_ob_shape)
         mb_rewards = np.asarray(mb_rewards,
                                 dtype=np.float32).swapaxes(1, 0)
         mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
         mb_dones = np.asarray(mb_dones, dtype=bool).swapaxes(1, 0)
         mb_masks = mb_dones[:, :-1]
         mb_dones = mb_dones[:, 1:]
         # calculate the last value
         with torch.no_grad():
             input_tensor = self._get_tensors(self.obs)
             last_values, _ = self.net(input_tensor)
         # compute returns
         for n, (rewards, dones, value) in enumerate(
                 zip(mb_rewards, mb_dones,
                     last_values.detach().cpu().numpy().squeeze())):
             rewards = rewards.tolist()
             dones = dones.tolist()
             if dones[-1] == 0:
                 rewards = discount_with_dones(rewards + [value],
                                               dones + [0],
                                               self.args.gamma)[:-1]
             else:
                 rewards = discount_with_dones(rewards, dones,
                                               self.args.gamma)
             mb_rewards[n] = rewards
         mb_rewards = mb_rewards.flatten()
         mb_actions = mb_actions.flatten()
         # start to update network
         vl, al, ent = self._update_network(mb_obs, mb_rewards, mb_actions)
         if update % self.args.log_interval == 0:
             print('[{}] Update: {}/{}, Frames: {}, Rewards: {:.1f}, VL: {:.3f}, PL: {:.3f}, Ent: {:.2f}, Min: {}, Max:{}'.format(\
                 datetime.now(), update, num_updates, (update+1)*(self.args.num_workers * self.args.nsteps),\
                 final_rewards.mean(), vl, al, ent, final_rewards.min(), final_rewards.max()))
             torch.save(self.net.state_dict(), self.model_path + 'model.pt')