Example #1
    def __init__(self,
                 env,
                 mem_size=7 * int(1e3),
                 lr_critic=1e-3,
                 lr_actor=1e-4,
                 epsilon=1.,
                 max_epi=1500,
                 epsilon_decay=1. / (1e5),
                 gamma=.99,
                 target_update_frequency=200,
                 batch_size=64,
                 random_process=True,
                 max_step=None):
        self.CUDA = torch.cuda.is_available()

        self.orig_env = env  #for recording
        if max_step is not None:
            self.orig_env._max_episode_steps = max_step
        self.env = self.orig_env
        self.N_S = self.env.observation_space.shape[0]
        self.N_A = self.env.action_space.shape[0]
        self.MAX_EPI = max_epi
        self.LOW = self.env.action_space.low
        self.HIGH = self.env.action_space.high

        self.actor = Actor(self.N_S, self.N_A)
        self.critic = Critic(self.N_S, self.N_A)
        self.target_actor = Actor(self.N_S, self.N_A)
        self.target_critic = Critic(self.N_S, self.N_A)
        self.target_actor.eval()
        self.target_critic.eval()
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())
        if self.CUDA:
            self.actor.cuda()
            self.critic.cuda()
            self.target_actor.cuda()
            self.target_critic.cuda()

        self.exp = Experience(mem_size)
        self.optim_critic = optim.Adam(self.critic.parameters(), lr=lr_critic)
        # note: the negative learning rate makes the Adam step ascend the actor
        # objective (assuming the actor loss is computed as +Q elsewhere)
        self.optim_actor = optim.Adam(self.actor.parameters(), lr=-lr_actor)
        self.random_process = OrnsteinUhlenbeckProcess(
                size=self.N_A, theta=.15, mu=0, sigma=.2)
        self.EPSILON = epsilon
        self.EPSILON_DECAY = epsilon_decay
        self.GAMMA = gamma
        self.TARGET_UPDATE_FREQUENCY = target_update_frequency
        self.BATCH_SIZE = batch_size

        title = {common.S_EPI: [], common.S_TOTAL_R: []}
        self.data = pd.DataFrame(title)
        self.RAND_PROC = random_process
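Note: OrnsteinUhlenbeckProcess is constructed above with size, theta, mu, and sigma, but its implementation is not shown. A minimal sketch of such a process, assuming the usual discretized OU dynamics (not necessarily this project's exact class):

import numpy as np

class OrnsteinUhlenbeckProcess:
    # Temporally correlated exploration noise:
    # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)
    def __init__(self, size, theta=.15, mu=0., sigma=.2, dt=1.):
        self.size, self.theta, self.mu, self.sigma, self.dt = size, theta, mu, sigma, dt
        self.reset()

    def reset(self):
        self.x = np.ones(self.size) * self.mu

    def sample(self):
        dx = self.theta * (self.mu - self.x) * self.dt \
             + self.sigma * np.sqrt(self.dt) * np.random.randn(self.size)
        self.x = self.x + dx
        return self.x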
Example #2
    def update_policy(self, striker_memory, goalie_memory):
        # do not train until exploration is enough
        if self.episode_done <= self.episodes_before_train:
            return None, None

        ByteTensor = torch.cuda.ByteTensor if self.use_cuda else torch.ByteTensor
        FloatTensor = torch.cuda.FloatTensor if self.use_cuda else torch.FloatTensor

        c_loss = []
        a_loss = []
        for agent_index in range(self.n_striker):
            s_transitions = striker_memory.sample(self.batchSize)
            g_transitions = goalie_memory.sample(self.batchSize)
            s_batch = Experience(*zip(*s_transitions))
            g_batch = Experience(*zip(*g_transitions))
            
            s_non_final_mask = ByteTensor(list(map(lambda s: s is not None,
                                                    s_batch.next_states)))
            g_non_final_mask = ByteTensor(list(map(lambda s: s is not None,
                                                    g_batch.next_states)))
            
            # state_batch: batch_size x n_agents x dim_obs
            s_state_batch = torch.stack(s_batch.states).type(FloatTensor)
            s_action_batch = torch.stack(s_batch.actions).type(FloatTensor)
            s_reward_batch = torch.stack(s_batch.rewards).type(FloatTensor)
            g_state_batch = torch.stack(g_batch.states).type(FloatTensor)
            g_action_batch = torch.stack(g_batch.actions).type(FloatTensor)
            g_reward_batch = torch.stack(g_batch.rewards).type(FloatTensor)
            
            # : (batch_size_non_final) x n_agents x dim_obs
            s_non_final_next_states = torch.stack(
                [s for s in s_batch.next_states
                 if s is not None]).type(FloatTensor)
            g_non_final_next_states = torch.stack(
                [s for s in g_batch.next_states
                 if s is not None]).type(FloatTensor)
            
            # for current agent
            s_whole_state = s_state_batch.view(self.batchSize, -1)
            print(s_whole_state.shape)
            s_whole_action = s_action_batch.view(self.batchSize, -1)
            print(s_whole_action.shape)
            g_whole_state = g_state_batch.view(self.batchSize, -1)
            print(g_whole_state.shape)
            g_whole_action = g_action_batch.view(self.batchSize, -1)
            print(g_whole_action.shape)
            # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
            # we need a discussion to define the meaning of act_dim   #
            # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
            self.s_critic_optimizer[agent_index].zero_grad()
            self.g_critic_optimizer[agent_index].zero_grad()
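Note: the Experience(*zip(*transitions)) idiom used throughout these examples assumes Experience is a namedtuple whose fields mirror the stored transition; zip(*transitions) transposes a list of per-step tuples into one tuple per field. A toy sketch of the pattern (field names inferred from the attribute accesses above, not guaranteed to match the project):

from collections import namedtuple

Experience = namedtuple('Experience', ('states', 'actions', 'next_states', 'rewards'))

# two toy scalar transitions sampled from a replay buffer
transitions = [Experience(1, 0, 2, 0.0), Experience(2, 1, None, 1.0)]
batch = Experience(*zip(*transitions))   # transpose: one tuple per field
print(batch.states)       # (1, 2)
print(batch.next_states)  # (2, None) -- None marks a terminal next state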
Example #3
    def __call__(self, env):
        result = []
        for episode in range(self.num_episodes):

            # reset at the start of episode
            current_obs = env.reset()
            next_obs = None
            episode_steps = 0
            episode_reward = 0.

            assert current_obs is not None

            # start episode
            done = False
            while not done:
                if next_obs is not None:
                    current_obs = next_obs
                # basic step: choose an action, step the environment, collect the reward
                action = self.policy(current_obs)
                next_obs, reward, done, info = env.step(action)

                if next_obs is None:
                    current_expr = Experience(current_obs, action, reward,
                                              next_obs, True)
                else:
                    current_expr = Experience(current_obs, action, reward,
                                              next_obs, False)

                # put data into the queue
                self.queue.put(current_expr)
                # only max steps
                if self.max_step == "None" or episode_steps >= self.max_step:
                    done = True

                if self.visualize:
                    env.render(mode='human')

                # update
                episode_reward += reward
                episode_steps += 1

            result.append(episode_reward)

        result = np.array(result).reshape(-1, 1)

        if self.save:
            self.save_results(
                result, os.path.join(self.save_path, "validate_reward"))
        return np.mean(result)
Example #4
    def do_setup(self, args: Dict, observation: np.ndarray,
                 session: tf.Session):
        self.frame_size = args.frame_size
        self.stack_size = args.stack_size

        self.mem = Experience(capacity=args.mem_capacity)
        self.dqn = DQN((*args.frame_size, args.stack_size), self.max_action,
                       args.learning_rate, "ex")
        self.stacked_frames, self.state = stack_frames(
            deque([], maxlen=self.stack_size), observation, args.frame_size,
            args.stack_size)

        self.sess = session
        self.sess.run(tf.global_variables_initializer())
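Note: stack_frames is called here and in Example #12 but not shown. A common deque-based sketch (an assumption about its interface, including the grayscale/resize preprocessing):

from collections import deque
import numpy as np
import cv2

def stack_frames(stacked_frames, observation, frame_size, stack_size):
    # Preprocess the new observation and keep the last `stack_size` frames.
    frame = cv2.resize(cv2.cvtColor(observation, cv2.COLOR_RGB2GRAY), frame_size)
    frame = frame.astype(np.float32) / 255.0
    if len(stacked_frames) == 0:
        # start of an episode: fill the deque with copies of the first frame
        stacked_frames = deque([frame] * stack_size, maxlen=stack_size)
    else:
        stacked_frames.append(frame)
    state = np.stack(stacked_frames, axis=2)  # shape: (*frame_size, stack_size)
    return stacked_frames, state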
Example #5
    def update_rule(self):
        if self.episode_done <= self.episodes_before_train:
            return None
        FloatTensor = th.cuda.FloatTensor if self.use_cuda else th.FloatTensor
        transitions = self.memory.sample(self.batch_size)
        batch = Experience(*zip(*transitions))

        state_batch = th.stack(batch.states).type(FloatTensor)
        action_batch = th.stack(batch.actions).type(FloatTensor)

        whole_state = state_batch.view(self.batch_size, -1)
        whole_action = action_batch.view(self.batch_size, -1)        

        #for ag in range(self.n_agents):
        true_act, rules = self.select_rule_action(state_batch)

        # alternate between the two rule heads on even/odd steps
        rule_id = 0 if self.steps_done % 2 == 0 else 1

        Q = []
        for ag in range(self.n_agents):
            Q.append(self.critics[ag](whole_state, Variable(th.Tensor(true_act))))
        Qsum = sum(Q)
        if self.steps_done % 600 == 0:
            print("true_act..", true_act[15])
            print("rule..", rules[rule_id][15])
            print("Qsum..", Qsum[15])
        loss_r = -rules[rule_id] * Qsum
        loss_r = loss_r.mean()
        loss_r.backward()
        self.constrain_optimizer.step()
        return loss_r
Example #6
    def play(self):
        # self.env.render()
        s = self.s
        if self.step < self.learning_starts:
            a = self.env.action_space.sample()
        else:
            a = self.epsilon_greedy()
        old_lives = self.env.lives()
        SP, r, terminal, step_info = self.env.step(a)
        new_lives = self.env.lives()
        self.episode_scores += r
        sp = self.four_frames_to_4_84_84(SP)

        if new_lives < old_lives:
            print('agent died, current lives = ', new_lives)
            r = min(-1.0, r)

        if terminal and new_lives > 0:
            task_done = True
            done = 1
            r = max(1.0, r)
            print('task is solved successfully, end of episode')
        else:
            task_done = False
            done = 0
            r = min(-0.1, r)  # just a tiny punishment

        self.episode_rewards += r

        if terminal and new_lives == 0:
            terminal = True
            print('agent terminated, end of episode')
            r = min(-10.0, r)

        r = np.clip(r, -1.0, 1.0)
        experience = Experience(s, a, r, sp, done)
        self.experience_memory.push(experience)
        self.s = copy.deepcopy(sp)

        if terminal or task_done:
            self.episode_steps_list.append(self.step)
            self.episode_scores_list.append(self.episode_scores)
            self.episode_rewards_list.append(self.episode_rewards)
            self.game_episode += 1
            self.episode_rewards = 0.0
            self.episode_scores = 0.0
            self.episode_end_time = time.time()
            episode_time = self.episode_end_time - self.episode_start_time
            self.episode_time_list.append(episode_time)
            print('episode time: {0:.2f}'.format(episode_time))
            print('-' * 60)
            print('game episode: ', self.game_episode)
            print('time step: ', self.step)
            self.episode_start_time = time.time()
            S = self.env.reset()  # reset S
            self.s = self.four_frames_to_4_84_84(S)  # get s
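Note: because of the final np.clip, the large terminal penalty above ends up with the same clipped value as the ordinary death penalty. A quick standalone check (raw reward value assumed for illustration):

import numpy as np

raw = 0.0
death     = np.clip(min(-1.0,  raw), -1.0, 1.0)  # -1.0
success   = np.clip(max(1.0,   raw), -1.0, 1.0)  #  1.0
game_over = np.clip(min(-10.0, raw), -1.0, 1.0)  # -1.0, identical to the death case after clipping
print(death, success, game_over)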
Example #7
    def start_training(self):
        start_episode = self.training_info['episode']
        frames_passed = self.training_info['frames']
        train_frames = 1000000
        t = 0
        for episode in range(start_episode, episodes + 1):
            # Set initial state
            state = self.read_image(t)
            episode_start_time = time.time()
            while t < train_frames:
                t += 1
                random_probability = self.random_action_policy.get_probability(
                    frames_passed)
                if random.random() < random_probability:
                    action = self.random_action_policy.sample_action(
                        self.action_type)
                else:
                    # noinspection PyTypeChecker
                    action = self.action_type.from_code(
                        np.argmax(self._predict(state)))

                for _ in range(self.training_info['batches_per_frame']):
                    self._train_minibatch()
                new_state, reward = self.read_image(t)
                experience = Experience(state, action, reward, new_state)
                self.memory.append_experience(experience)

                state = new_state
                frames_passed += 1

                # Print status
                time_since_failure = time.time() - episode_start_time
                print('Episode {}, Total frames {}, ε={:.4f}, Reward {:.4f}, '
                      '{:.0f}s since failure'.format(episode, frames_passed,
                                                     random_probability,
                                                     reward,
                                                     time_since_failure),
                      end='\r')

                # Save model after a fixed amount of frames
                if frames_passed % 1000 == 0:
                    self.training_info['episode'] = episode
                    self.training_info['frames'] = frames_passed
                    self.training_info[
                        'mean_training_time'] = self.mean_training_time.get()
                    self.training_info.save()
                    self.model.save(self.MODEL_PATH)
Example #8
    def predict(self):
        signal.signal(signal.SIGINT, self.stop)
        while True:
            state = self.environment.read_sensors(self.image_size,
                                                  self.image_size)[0]
            while not state.is_terminal:
                action = self.action_type.from_code(
                    np.argmax(self._predict(state)))
                self.environment.write_action(action)
                # Wait as long as we usually need to wait due to training
                time.sleep(self.training_info['batches_per_frame'] *
                           self.training_info['mean_training_time'])
                new_state, reward = self.environment.read_sensors(
                    self.image_size, self.image_size)
                experience = Experience(state, action, reward, new_state)
                self.memory.append_experience(experience)
                state = new_state

                if self.should_exit:
                    sys.exit(0)
Example #9
    def update(self, step: dm_env.TimeStep, action: int,
               next_step: dm_env.TimeStep) -> None:
        """
        Adds experience to the replay memory, performs an optimization_step and updates the q_target neural network.
        Args:
            step(dm_env.TimeStep): Current observation from the environment
            action(int): The action that was performed by the agent.
            next_step(dm_env.TimeStep): Next observation from the environment
        Returns:
            None
        """

        observation = np.array(step.observation).flatten()
        next_observation = np.array(next_step.observation).flatten()
        done = next_step.last()
        exp = Experience(observation, action, next_step.reward,
                         next_step.discount, next_observation, 0, done)
        self.memory.add(exp)

        if self.memory.number_samples() < self.start_optimization:
            return

        if self.number_steps % self.update_qnet_every == 0:
            s0, a0, n_step_reward, discount, s1, _, dones, indices, weights = self.memory.sample_batch(
                self.batch_size)
            if not self.distributional:
                self.optimization_step(s0, a0, n_step_reward, discount, s1,
                                       indices, weights)
            else:
                self.distributional_optimization_step(s0, a0, n_step_reward,
                                                      discount, s1, dones,
                                                      indices, weights)

        if self.number_steps % self.update_target_every == 0:
            self.q_target.load_state_dict(self.qnet.state_dict())
        return
Example #10
    def update_policy(self, i_episode):
        # do not train until exploration is enough
        if self.episode_done <= self.episodes_before_train:
            return None, None

        ByteTensor = th.cuda.ByteTensor if self.use_cuda else th.ByteTensor
        FloatTensor = th.cuda.FloatTensor if self.use_cuda else th.FloatTensor

        c_loss = []
        a_loss = []

        for agent in range(self.n_agents):
            transitions = self.memory.sample(self.batch_size)
            batch = Experience(*zip(*transitions))

            non_final_mask = ByteTensor(
                list(map(lambda s: s is not None, batch.next_states)))
            # state_batch: batch_size x n_agents x dim_obs
            state_batch = Variable(th.stack(batch.states).type(FloatTensor))
            #whole_list = []
            # next_whole_list = []
            # next_state_count = 0
            #print(len(batch.states) == len(batch.next_states))
            # for i in range(len(batch.states)):
            #     n_list = []
            #     for j in range(4):
            #         for k in range(len(batch.states[i][j])):
            #             n_list.append(batch.states[i][j][k].data.numpy())
            #             #if batch.next_states[i] != None:
            #                 #print('batch.next_states[i][j][k]',type(batch.next_states[i][j][k]),i,j,k)
            #                 # next_state_count += 1
            #     n_array = np.asarray(n_list)
            #     # print('n_array',type(n_array))
            #     n_tensor = th.from_numpy(n_array).float()
            #     n_variable = Variable(n_tensor).type(FloatTensor)
            #     whole_list.append(n_variable.data.numpy())
            # whole_array = np.asarray(whole_list)
            # whole_tensor = th.from_numpy(whole_array).float()

            # for i in range(len(batch.states)):
            #     next_list = []
            #     if batch.next_states[i] != None:
            #         for j in range(4):
            #             for k in range(len(batch.next_states[i][j])):
            #                 #print('batch.next_states[i][j][k]',batch.next_states[i][j][k],i,j,k)
            #                 next_list.append(batch.next_states[i][j][k].data.numpy())
            #         next_array = np.asarray(next_list)
            #         next_tensor = th.from_numpy(next_array).float()
            #         next_variable = Variable(next_tensor).type(FloatTensor)
            #         next_whole_list.append(th.t(next_variable).data.numpy())
            # next_whole_array = np.asarray(next_whole_list)
            # next_whole_tensor = th.from_numpy(next_whole_array).float()

            #state_batch = Variable(th.stack(whole_tensor).type(FloatTensor))
            # print('state_batch',state_batch)  #[torch.FloatTensor of size 100x62x1]
            # state_batch = Variable(th.stack(batch.states).type(FloatTensor))
            action_batch = Variable(th.stack(batch.actions).type(FloatTensor))
            reward_batch = Variable(th.stack(batch.rewards).type(FloatTensor))
            # : (batch_size_non_final) x n_agents x dim_obs
            # print('next_whole_tensor',next_whole_tensor)
            #non_final_next_states = Variable(th.stack(next_whole_tensor).type(FloatTensor))
            non_final_next_states = Variable(
                th.stack([s for s in batch.next_states
                          if s is not None]).type(FloatTensor))
            # print('non_final_next_states',non_final_next_states)  [torch.FloatTensor of size 99x1x62]
            # for current agent
            whole_state = state_batch.view(self.batch_size, -1)
            # print('whole_state',whole_state)  [torch.FloatTensor of size 100x62]
            whole_action = action_batch.view(self.batch_size, -1)
            # non_final_next_states = non_final_next_states.view(next_state_count,-1)
            # print('non_final_next_states',non_final_next_states)
            self.critic_optimizer[agent].zero_grad()

            current_Q = self.critics[agent](whole_state, whole_action)

            # non_final_next_actions = []

            # for a in range(self.n_agents):
            #     batch_obs = []
            #     # for j in range(self.n_agents):
            #     for i in range(len(batch.next_states)):
            #         if batch.next_states[i] is not None:
            #             batch_obs.append(batch.next_states[i][a].data.numpy())
            #             # print('batch_obs',type(batch.next_states[i][a]))  'torch.autograd.variable.Variable'
            #     batch_obs = np.asarray(batch_obs)
            #     batch_obs = th.from_numpy(batch_obs).float()
            #     batch_obs = Variable(batch_obs).type(FloatTensor)
            #     # print('batch_obs',batch_obs)  [torch.FloatTensor of size 99x16]
            #     non_final_next_actions.append(self.actors_target[a](batch_obs))
            # print('non_final_next_actions',non_final_next_actions)

            non_final_next_actions = [  #[torch.FloatTensor of size 989x2]
                self.actors_target[i](
                    non_final_next_states[:,  #[torch.FloatTensor of size 989x213]
                                          i, :]) for i in range(self.n_agents)
            ]
            non_final_next_actions = th.stack(non_final_next_actions)
            # non_final_next_actions = Variable(non_final_next_actions)
            non_final_next_actions = (non_final_next_actions.transpose(
                0, 1).contiguous())
            target_Q = Variable(th.zeros(self.batch_size).type(FloatTensor))
            # print('non_final_mask',non_final_mask)
            target_Q[non_final_mask] = self.critics_target[agent](
                non_final_next_states.view(-1, self.n_agents * self.n_states),
                non_final_next_actions.view(-1,
                                            self.n_agents * self.n_actions))

            # scale_reward: to scale reward in Q functions
            target_Q = (target_Q * self.GAMMA) + (reward_batch[:, agent] *
                                                  scale_reward)

            loss_Q = nn.MSELoss()(current_Q, target_Q.detach())
            loss_Q.backward()
            self.critic_optimizer[agent].step()

            self.actor_optimizer[agent].zero_grad()
            state_i = state_batch[:, agent, :]
            # state_i = []
            # for i in range(len(state_batch)):
            #     state_i.append(batch.states[i][agent].data.numpy())
            # print('batch_obs',type(batch.next_states[i][a]))  'torch.autograd.variable.Variable'
            #state_i = np.asarray(state_i)
            #state_i = th.from_numpy(state_i).float()
            #state_i = Variable(state_i).type(FloatTensor)
            # print('state_i',state_i)  [torch.FloatTensor of size 100x1]
            action_i = self.actors[agent](state_i)
            ac = action_batch.clone()
            ac[:, agent, :] = action_i
            whole_action = ac.view(self.batch_size, -1)
            actor_loss = -self.critics[agent](whole_state, whole_action)
            actor_loss = actor_loss.mean()
            actor_loss.backward()
            self.actor_optimizer[agent].step()
            c_loss.append(loss_Q)
            a_loss.append(actor_loss)

        if self.steps_done % 100 == 0 and self.steps_done > 0:
            for i in range(self.n_agents):
                soft_update(self.critics_target[i], self.critics[i], self.tau)
                soft_update(self.actors_target[i], self.actors[i], self.tau)
        if i_episode % 100 == 0:
            for i in range(self.n_agents):
                th.save(self.critics[i],
                        'critic[' + str(i) + '].pkl_episode' + str(i_episode))
                th.save(self.actors[i],
                        'actors[' + str(i) + '].pkl_episode' + str(i_episode))
        return c_loss, a_loss
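Note: soft_update is called above but not defined in this snippet. The usual Polyak-averaging form (an assumption about this project's helper) is:

def soft_update(target, source, tau):
    # theta_target <- tau * theta_source + (1 - tau) * theta_target
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * source_param.data +
                                (1.0 - tau) * target_param.data)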
Example #11
    def update_policy(self, memory):

        # memory format is memory.push(prev_state, states, [prev_action_striker, prev_action_goalie], prev_reward)
        # do not train until exploration is enough
        # if self.episode_done <= self.episode_before_training:
        # return None, None

        c_loss = []
        a_loss = []

        if len(memory) < 1024 * 10:
            return None, None
        for agent_index in range(self.n_striker):

            #   #   #   #   #   #   #   #   #   #   #   #   #   #   #   #   #   #   #   #   #   #   #
            # batch sample is batch * N play ground * agents * state/next_state/action/reward/      #
            #   #   #   #   #   #   #   #   #   #   #   #   #   #   #   #   #   #   #   #   #   #   #

            transitions = memory.sample(self.batchSize_d2)
            batch = Experience(*zip(*transitions))

            batch_state = np.asarray(batch.states)
            batch_action = np.asarray(batch.actions)
            batch_reward = np.asarray(batch.rewards)
            batch_reward = torch.from_numpy(batch_reward).to(
                self.device).float()
            batch_next_state = np.asarray(batch.next_states)

            state_batch = torch.from_numpy(batch_state)
            action_batch = torch.from_numpy(batch_action)
            next_state_batch = torch.from_numpy(batch_next_state)
            #   #   #   #   #   #   #   #   #
            # total numbers of data = batchsize * play ground   #
            #   #   #   #   #   #   #   #   #
            total_numbers_of_data = batch_state.shape[0] * batch_state.shape[1]

            whole_state = state_batch.view(total_numbers_of_data,
                                           -1).to(self.device).float()
            whole_action = action_batch.view(total_numbers_of_data * 4,
                                             -1).long()

            #   #   #   #
            # translate action into one hot #
            #   #   #   #
            one_hot = (whole_action == torch.arange(7).reshape(1, 7)).float()
            one_hot = one_hot.view(total_numbers_of_data, -1).to(self.device)

            self.critic_optimizer[0].zero_grad()
            self.critic_optimizer[1].zero_grad()

            s_current_Q = self.critic[0](whole_state, one_hot)
            g_current_Q = self.critic[1](whole_state, one_hot)
            s_whole_next_state = next_state_batch[:, :, 0:2, :].to(
                self.device).float()
            s_whole_next_state = s_whole_next_state.view(
                total_numbers_of_data * 2, -1)

            g_whole_next_state = next_state_batch[:, :, 2:4, :].to(
                self.device).float()
            g_whole_next_state = g_whole_next_state.view(
                total_numbers_of_data * 2, -1)
            #   #   #
            # Next_actions  #
            #   #   #
            s_next_actions = [self.s_actors_target[0](s_whole_next_state)]
            g_next_actions = [self.g_actors_target[0](g_whole_next_state)]
            s_next_actions = torch.stack(s_next_actions)
            g_next_actions = torch.stack(g_next_actions)
            s_next_actions = (s_next_actions.transpose(0, 1).contiguous())
            g_next_actions = (g_next_actions.transpose(0, 1).contiguous())

            s_next_state = s_whole_next_state.view(-1, 2, 112).to(self.device)
            g_next_state = g_whole_next_state.view(-1, 2, 112).to(self.device)
            whole_next_stat = torch.cat([s_next_state, g_next_state], dim=-2)

            s_next_actions = s_next_actions.view(-1, 14).to(self.device)
            g_next_actions = g_next_actions.view(-1, 14).to(self.device)
            whole_next_action = torch.cat([s_next_actions, g_next_actions],
                                          dim=-1)

            whole_next_stat = whole_next_stat.view(-1, 112 * 4)
            whole_next_action = whole_next_action.view(-1, 7 * 4)

            s_target_Q = self.critic_target[0](whole_next_stat,
                                               whole_next_action)
            g_target_Q = self.critic_target[1](whole_next_stat,
                                               whole_next_action)
            # scale_reward: to scale reward in Q functions
            batch_reward = batch_reward.view(64, -1)  # hard-coded batch dimension; assumes the sampled batch flattens to 64 rows

            s_1_target_Q = (s_target_Q * self.GAMMA) + (
                batch_reward[:, 0].unsqueeze(1) * self.scale_reward)
            s_2_target_Q = (s_target_Q * self.GAMMA) + (
                batch_reward[:, 1].unsqueeze(1) * self.scale_reward)
            g_1_target_Q = (g_target_Q * self.GAMMA) + (
                batch_reward[:, 2].unsqueeze(1) * self.scale_reward)
            g_2_target_Q = (g_target_Q * self.GAMMA) + (
                batch_reward[:, 3].unsqueeze(1) * self.scale_reward)
            # 64 *1

            # # #
            # Update first striker #
            # # #
            s_1_loss_Q = nn.MSELoss()(s_current_Q, s_1_target_Q.detach())
            s_1_loss_Q.backward(retain_graph=True)
            self.critic_optimizer[0].step()

            # # #
            # Update 2nd striker #
            # # #
            # print(s_2_target_Q)
            self.critic_optimizer[0].zero_grad()
            s_2_loss_Q = nn.MSELoss()(s_current_Q, s_2_target_Q.detach())
            s_2_loss_Q.backward()
            self.critic_optimizer[0].step()

            # # #
            # Update first goalie #
            # # #
            self.critic_optimizer[1].zero_grad()
            g_1_loss_Q = nn.MSELoss()(g_current_Q, g_1_target_Q.detach())
            g_1_loss_Q.backward(retain_graph=True)
            self.critic_optimizer[1].step()

            # # #
            # Update 2nd goalie #
            # # #
            self.critic_optimizer[1].zero_grad()
            g_2_loss_Q = nn.MSELoss()(g_current_Q, g_2_target_Q.detach())
            g_2_loss_Q.backward()
            self.critic_optimizer[1].step()

            self.s_actor_optimizer[agent_index].zero_grad()
            self.g_actor_optimizer[agent_index].zero_grad()
            # state_i = state_batch[:, agent_index, :]
            # action_i = self.actors[agent_index](state_i)

            s_state = batch_state[:, :, 0:2, :]
            s_state = torch.from_numpy(s_state).to(self.device).float()
            s_state = s_state.view(total_numbers_of_data * 2, -1)
            g_state = batch_state[:, :, 2:4, :]
            g_state = torch.from_numpy(g_state).to(self.device).float()
            g_state = g_state.view(total_numbers_of_data * 2, -1)
            s_action = self.s_actor[agent_index](s_state)
            g_action = self.g_actor[agent_index](g_state)
            # # #
            # striker #
            # #
            s_ac = one_hot.clone()  # 8*8 * 7
            g_ac = one_hot.clone()
            s_action = s_action.view(-1, 1, 7).to(self.device)
            g_action = g_action.view(-1, 1, 7).to(self.device)
            # print(s_action.shape)
            s_ac = s_ac.view(-1, 2, 7)
            # print(s_ac.shape)
            s_ac[:, 0] = s_action.squeeze()
            g_ac = g_ac.view(-1, 2, 7)
            g_ac[:, 1] = g_action.squeeze()
            sup_action = s_ac.view(total_numbers_of_data, -1)
            gup_action = g_ac.view(total_numbers_of_data, -1)
            sactor_loss = -self.critic[0](whole_state, sup_action)
            sactor_loss = sactor_loss.mean()
            sactor_loss.backward()
            gactor_loss = -self.critic[1](whole_state, gup_action)
            gactor_loss = gactor_loss.mean()
            gactor_loss.backward()
            self.s_actor_optimizer[agent_index].step()
            self.g_actor_optimizer[agent_index].step()
            # # #
            # goalie #
            # #
            c_loss.append(s_1_loss_Q + s_2_loss_Q + g_1_loss_Q + g_2_loss_Q)
            a_loss.append(sactor_loss + gactor_loss)

        if self.steps_done % 100 == 0 and self.steps_done > 0:
            soft_update(self.critic_target[0], self.critic[0], self.tau)
            soft_update(self.s_actors_target[0], self.s_actor[0], self.tau)
            soft_update(self.critic_target[1], self.critic[1], self.tau)
            soft_update(self.g_actors_target[0], self.g_actor[0], self.tau)

        return c_loss, a_loss
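Note: the one-hot conversion above relies on broadcasting a column of integer action indices against torch.arange(7). A standalone check of that trick with toy values:

import torch

actions = torch.tensor([[0], [3], [6]])  # column of integer action indices
one_hot = (actions == torch.arange(7).reshape(1, 7)).float()
print(one_hot)
# tensor([[1., 0., 0., 0., 0., 0., 0.],
#         [0., 0., 0., 1., 0., 0., 0.],
#         [0., 0., 0., 0., 0., 0., 1.]])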
Example #12
class DQNAgent:
    def __init__(self, max_action: int, training=False):
        self.max_action = max_action
        self.sess = None

    def reset_state(self, observation: np.ndarray):
        self.stacked_frames, self.state = stack_frames(
            deque([], maxlen=self.stack_size), observation, self.frame_size,
            self.stack_size)

    def do_setup(self, args: Dict, observation: np.ndarray,
                 session: tf.Session):
        self.frame_size = args.frame_size
        self.stack_size = args.stack_size

        self.mem = Experience(capacity=args.mem_capacity)
        self.dqn = DQN((*args.frame_size, args.stack_size), self.max_action,
                       args.learning_rate, "ex")
        self.stacked_frames, self.state = stack_frames(
            deque([], maxlen=self.stack_size), observation, args.frame_size,
            args.stack_size)

        self.sess = session
        self.sess.run(tf.global_variables_initializer())

    def remember(self, observation: np.ndarray, action: int, reward: float):
        """
        Add current observation to the stack of frames and create a memory
        entry corresponding to this tuple. Also update the internal state of
        the agent.
        """
        self.stacked_frames, next_state = stack_frames(self.stacked_frames,
                                                       observation,
                                                       self.frame_size,
                                                       self.stack_size)
        self.mem.add((self.state, action, reward, next_state))
        self.state = next_state

    def get_random_action(self):
        return np.random.randint(self.max_action)

    # TODO: move explore_prob_* and decay rate inside the agent
    # This should be used when the agent is still in training.
    def predict_action(self, explore_prob_begin: float,
                       explore_prob_min: float, decay_rate: float,
                       decay_step: int):
        explore_prob_curr = explore_prob_min + \
            (explore_prob_begin - explore_prob_min) * \
            np.exp(-decay_rate * decay_step)
        if np.random.rand() < explore_prob_curr:
            action = self.get_random_action()
        else:
            Qs = self.sess.run(self.dqn.output,
                               feed_dict={
                                   self.dqn.inputs_:
                                   self.state.reshape(1, *self.state.shape)
                               })
            action = int(np.argmax(Qs))
            print('action: %d' % action)

        return action, explore_prob_curr

    def act(self, observation: np.ndarray):
        """
        :param observation: numpy array of shape (width, height, 3), as defined in the config file
        :return: int between 0 and max_action
        This method should be called when the agent is already trained.
        """

        if self.sess is None:
            # Some hardcoded parameters are used here.
            session = tf.Session()
            self.dqn = DQN((*(84, 84), 4), self.max_action, 0.002, "ex")
            saver = tf.train.Saver()
            saver.restore(session, './models/second_model.ckpt')
            self.sess = session
            self.cnt = 0
            self.stacked_frames, self.state = stack_frames(
                deque([], maxlen=4), observation, (84, 84), 4)

        # Used to visualize the game when testing the model.
        cv2.imwrite('game_' + str(self.cnt) + '.png', observation)
        self.cnt += 1

        self.stacked_frames, self.state = stack_frames(self.stacked_frames,
                                                       observation, (84, 84),
                                                       4)

        Qs = self.sess.run(self.dqn.output,
                           feed_dict={
                               self.dqn.inputs_:
                               self.state.reshape(1, *self.state.shape)
                           })
        return int(np.argmax(Qs))

    def learn(self):
        """ Update policy and value parameters using given batch of experience tuples"""
        if self.eps_done <= self.eps_b_train:
            return None, None

        if self.eps_done == (self.eps_b_train + 1):
            print("========== Training now =========")

        ByteTensor = th.cuda.ByteTensor if self.cuda_on else th.ByteTensor
        FloatTensor = th.cuda.FloatTensor if self.cuda_on else th.FloatTensor

        c_loss = []
        a_loss = []

        for agent in range(self.n_agents):
            transitions = self.memory.sample(self.batch_size)
            batch = Experience(*zip(*transitions))

            non_final_mask = ByteTensor(list(map(lambda s: s is not None,
                                                 batch.next_states)))
            # state_batch: batch_size x n_agents x dim_obs
            state_batch = th.stack(batch.states).type(FloatTensor)
                  
            reward_batch = th.stack(batch.rewards).type(FloatTensor)
            action_batch = th.stack(batch.actions).type(FloatTensor)
            #pdb.set_trace()
            # : (batch_size_non_final) x n_agents x dim_obs
            non_final_next_states = th.stack(
                [s for s in batch.next_states
                 if s is not None]).type(FloatTensor)

            # for current agent
            whole_state = state_batch.view(self.batch_size, -1)
            whole_action = action_batch.view(self.batch_size, -1)
            self.critic_optimizer[agent].zero_grad()
            current_Q = self.critics[agent](whole_state, whole_action)

            non_final_next_actions = [
                self.actors_target[i](non_final_next_states[:,
                                                            i,
                                                            :]) for i in range(
                                                                self.n_agents)]
            non_final_next_actions = th.stack(non_final_next_actions)
            non_final_next_actions = (
                non_final_next_actions.transpose(0,
                                                 1).contiguous())

            target_Q = th.zeros(
                self.batch_size).type(FloatTensor)

            target_Q[non_final_mask] = self.critics_target[agent](
                non_final_next_states.view(-1, self.n_agents * self.dim_obs),
                non_final_next_actions.view(-1,
                                            self.n_agents * self.dim_act)
            ).squeeze()
            # scale_reward: to scale reward in Q functions

            target_Q = (target_Q.unsqueeze(1) * GAMMA) + (
                reward_batch[:, agent].unsqueeze(1) * SCALE_REWARD)

            loss_Q = nn.MSELoss()(current_Q, target_Q.detach())
            loss_Q.backward()
            self.critic_optimizer[agent].step()

            self.actor_optimizer[agent].zero_grad()
            state_i = state_batch[:, agent, :]
            action_i = self.actors[agent](state_i)
            ac = action_batch.clone()
            ac[:, agent, :] = action_i
            whole_action = ac.view(self.batch_size, -1)
            actor_loss = -self.critics[agent](whole_state, whole_action)
            actor_loss = actor_loss.mean()
            actor_loss.backward()
            self.actor_optimizer[agent].step()
            c_loss.append(loss_Q)
            a_loss.append(actor_loss)


        #if self.steps_done % NUM_STEPS_TO_UPDATE == 0 and self.steps_done > 0:
            #for i in range(self.n_agents):
            soft_update(self.critics_target[agent], self.critics[agent], TAU)
            soft_update(self.actors_target[agent], self.actors[agent], TAU)

        return c_loss, a_loss
Example #14
    def train(self):
        print('-' * 60)
        print('-' * 60)
        print(
            'PHASE I: Initial Intrinsic Motivation Learning and Subgoal Discovery'
        )
        print('Purpose 1) Training Controller to reach random locations')
        print('Purpose 2) Discovering subgoals')
        print('-' * 60)
        print('-' * 60)

        # reset
        print('-' * 60)
        print('game episode: ', self.game_episode)
        print('time step: ', self.step)
        S = self.env.reset()
        s = four_frames_to_4_84_84(S)
        man_mask = self.image_processor.get_man_mask(S)
        man_loc = get_man_xy_np_coordinate(man_mask)
        g_id, subgoal_mask = self.image_processor.sample_from_random_subgoal_set(
        )  # random g
        print('new subgoal assigned, g_id = ', g_id)
        subgoal_frame = self.image_processor.create_mask_frame(subgoal_mask)
        g = single_channel_frame_to_1_84_84(subgoal_frame)
        for t in range(self.max_iter + 1):
            self.step = t
            if t < self.learning_starts:
                a = self.env.action_space.sample()
            else:
                a = self.epsilon_greedy(s, g)

            old_lives = self.env.lives()
            SP, r, terminal, step_info = self.env.step(a)
            new_lives = self.env.lives()
            self.episode_scores += r
            sp = four_frames_to_4_84_84(SP)
            man_mask = self.image_processor.get_man_mask(SP)
            man_loc = get_man_xy_np_coordinate(man_mask)
            intrinsic_done_task = is_man_inside_subgoal_mask(
                man_mask, subgoal_mask)

            # outlier for the subgoal discovery
            if r > 0:
                print('############# found an outlier ###############')
                self.subgoal_discovery.push_outlier(man_loc)
            else:
                r = -0.1  # small negative reward

            if intrinsic_done_task:
                intrinsic_done = 1  # binary mask
                print('successful intrinsic motivation learning to g_id = ',
                      g_id)
                r_tilde = +1.0
                self.intrinsic_motivation_learning_episode += 1
            else:
                intrinsic_done = 0
                r_tilde = -0.1  # small negative reward to motivate the agent to solve the task

            if new_lives < old_lives:
                print('agent died, current lives = ', new_lives)
                r = -1.0
                r_tilde = -1.0  # dying reward

            if r > 100:  # solving room #1, which in our paper is equivalent to task completion
                task_done = True
                done = 1  # binary mask for done
                print('The room #1 task is completed, needs to reset!')
            else:
                task_done = False
                done = 0

            if terminal:
                print('agent terminated, end of episode')
                r = -10.0

            self.episode_rewards += r  # including negative rewards for death

            r = np.clip(r, -1.0, 1.0)
            experience = Experience(s, g, g_id, a, r, r_tilde, sp,
                                    intrinsic_done, done, man_loc)
            self.experience_memory.push(experience)

            s = copy.deepcopy(sp)
            self.anneal_epsilon()

            if intrinsic_done_task:  # reset subgoal when intrinsic motivation task is accomplished
                g_id, subgoal_mask = self.image_processor.sample_from_random_subgoal_set(
                )  # random g
                print('new subgoal assigned, g_id = ', g_id)
                subgoal_frame = self.image_processor.create_mask_frame(
                    subgoal_mask)
                g = single_channel_frame_to_1_84_84(subgoal_frame)

            if (new_lives < old_lives
                ) and not terminal and self.repeat_noop_action > 0:
                for _ in range(self.repeat_noop_action
                               ):  # do 20 nothing actions to ignore post-death
                    S, _, _, _ = self.env.step(0)
                s = four_frames_to_4_84_84(S)

            if terminal or task_done:
                self.episode_scores_list.append(self.episode_scores)
                self.episode_rewards_list.append(self.episode_rewards)
                self.game_episode += 1
                self.episode_rewards = 0.0
                self.episode_scores = 0.0
                print('-' * 60)
                print('game episode: ', self.game_episode)
                print('time step: ', self.step)
                S = self.env.reset()  # reset S
                s = four_frames_to_4_84_84(S)  # get s
                man_mask = self.image_processor.get_man_mask(S)  # man's mask
                man_loc = get_man_xy_np_coordinate(
                    man_mask)  # man's np location
                g_id, subgoal_mask = self.image_processor.sample_from_random_subgoal_set(
                )  # id and mask of random subgoal
                print('new subgoal assigned, g_id = ', g_id)
                subgoal_frame = self.image_processor.create_mask_frame(
                    subgoal_mask)  #subgoal frame
                g = single_channel_frame_to_1_84_84(subgoal_frame)

            if (t > self.learning_starts) and (t % self.learning_freq == 0):
                self.controller.update_w()

            if (t > 0) and (t % self.subgoal_discovery_freq
                            == 0):  # find centroids
                X = self.experience_memory.get_man_positions()
                self.subgoal_discovery.feed_data(X)
                self.subgoal_discovery.find_kmeans_clusters()
                results_file_path = './results/subgoal_discovery_step_' + str(
                    t) + '.pkl'
                self.subgoal_discovery.save_results(
                    results_file_path=results_file_path)

            if (t > self.learning_starts) and (
                    t % self.test_freq == 0):  # test controller's performance
                self.test()

            if (t > 0) and (t % self.save_model_freq
                            == 0):  # save controller model
                model_save_path = './models/controller_step_' + str(
                    t) + '.model'
                self.controller.save_model(model_save_path)
                print('saving model, steps = ', t)

            if (t > 0) and (t % self.save_results_freq == 0):
                results_file_path = './results/performance_results_' + str(
                    t) + '.pkl'
                with open(results_file_path, 'wb') as f:
                    pickle.dump([
                        self.episode_scores_list, self.episode_rewards_list,
                        self.testing_scores
                    ], f)

            if (t > self.learning_starts) and (
                    t % self.controller_target_update_freq == 0):
                self.controller.update_target_params()
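Note: find_kmeans_clusters is not shown here. A minimal sketch of the clustering step it implies, using scikit-learn (an assumption for illustration, not the paper's implementation):

import numpy as np
from sklearn.cluster import KMeans

def find_subgoal_centroids(man_positions, n_subgoals=10):
    # Cluster the visited (x, y) agent positions; centroids act as candidate subgoals.
    X = np.asarray(man_positions, dtype=np.float32)  # shape: (num_samples, 2)
    kmeans = KMeans(n_clusters=n_subgoals, n_init=10).fit(X)
    return kmeans.cluster_centers_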
Example #15
File: maddpg.py Project: zwfightzw/MLM
    def update_policy(self, i_episode, initial_train):
        # do not train until exploration is enough
        if self.episode_done <= self.episodes_before_train:
            return None, None

        ByteTensor = th.cuda.ByteTensor if self.use_cuda else th.ByteTensor
        FloatTensor = th.cuda.FloatTensor if self.use_cuda else th.FloatTensor

        c_loss = []
        a_loss = []

        for agent in range(self.n_agents):
            transitions = self.memory.sample(self.batch_size)
            batch = Experience(*zip(*transitions))

            non_final_mask = ByteTensor(
                list(map(lambda s: s is not None, batch.next_states)))
            # state_batch: batch_size x n_agents x dim_obs
            state_batch = Variable(th.stack(batch.states).type(FloatTensor))

            action_batch = Variable(th.stack(batch.actions).type(FloatTensor))
            reward_batch = Variable(th.stack(batch.rewards).type(FloatTensor))

            s = [s for s in batch.next_states if s is not None]
            non_final_next_states = Variable(th.stack(s).type(FloatTensor))

            #tmp_whole_state = state_batch[:, (1, 4), :]
            initial_train = initial_train
            if agent == 4:
                tmp_state = Variable(
                    th.zeros(self.batch_size, 5, 22).type(FloatTensor))
                tmp_action = Variable(
                    th.zeros(self.batch_size, 5, 2).type(FloatTensor))
                tmp_no_final_s = Variable(
                    th.zeros(len(non_final_next_states), 5,
                             22).type(FloatTensor))
                non_final_next_actions = Variable(
                    th.zeros(len(non_final_next_states), 5,
                             2).type(FloatTensor))
            else:
                tmp_state = Variable(
                    th.zeros(self.batch_size, 4, 22).type(FloatTensor))
                tmp_action = Variable(
                    th.zeros(self.batch_size, 4, 2).type(FloatTensor))
                tmp_no_final_s = Variable(
                    th.zeros(len(non_final_next_states), 4,
                             22).type(FloatTensor))
                non_final_next_actions = Variable(
                    th.zeros(len(non_final_next_states), 4,
                             2).type(FloatTensor))

            non_final_next_actions_tmp = [  # [torch.FloatTensor of size 989x2]
                self.actors_target[i](
                    non_final_next_states[:,  # [torch.FloatTensor of size 989x213]
                                          i, :]) for i in range(self.n_agents)
            ]
            non_final_next_actions_tmp = Variable(
                th.stack(non_final_next_actions_tmp).type(FloatTensor))
            non_final_next_actions_tmp = (non_final_next_actions_tmp.transpose(
                0, 1).contiguous())

            startTime = datetime.datetime.now()
            # the main difference between the two values of initial_train: when it is True, the critic network
            # takes all agents' observations as input; when it is False, it takes only the observations of the four agents nearest to the current agent
            if initial_train is False:
                #non_final_next_actions = []
                for j in range(self.batch_size):
                    if j < len(non_final_next_states):
                        tmp_no_final_s[j, 0:4, :] = non_final_next_states[j, ([
                            i for i in range(self.n_agents)
                            if i != batch.max_id[j][agent]
                        ]), :]
                        #non_final_next_actions[j,:,:] = Variable(th.stack([self.actors_target[i](non_final_next_states[j, i, :]) for i in range(self.n_agents) if i!=batch.max_id[j][agent]]).type(FloatTensor))
                        non_final_next_actions[
                            j, 0:4, :] = non_final_next_actions_tmp[j, ([
                                i for i in range(self.n_agents)
                                if i != batch.max_id[j][agent]
                            ]), :]

                    tmp_state[j, 0:4, :] = state_batch[j, ([
                        i for i in range(self.n_agents)
                        if i != batch.max_id[j][agent]
                    ]), :]
                    tmp_action[j, 0:4, :] = action_batch[j, ([
                        i for i in range(self.n_agents)
                        if i != batch.max_id[j][agent]
                    ]), :]

            if agent == 4:
                tmp_state[:, 4, :] = tmp_state[:, 3, :]
                tmp_action[:, 4, :] = tmp_action[:, 3, :]
                tmp_no_final_s[:, 4, :] = tmp_no_final_s[:, 3, :]
                non_final_next_actions[:, 4, :] = non_final_next_actions[:,
                                                                         3, :]

            if initial_train is True:
                tmp_state = state_batch
                tmp_action = action_batch
                tmp_no_final_s = non_final_next_states

            #whole_state = state_batch.view(self.batch_size, -1)
            #print('-----------------------------')
            whole_state = tmp_state.view(self.batch_size, -1)
            # print('whole_state',whole_state)  [torch.FloatTensor of size 100x62]
            whole_action = tmp_action.view(self.batch_size, -1)
            # non_final_next_states = non_final_next_states.view(next_state_count,-1)
            # print('non_final_next_states',non_final_next_states)
            self.critic_optimizer[agent].zero_grad()

            current_Q = self.critics[agent](whole_state, whole_action)

            if initial_train is True:
                non_final_next_actions = [  # [torch.FloatTensor of size 989x2]
                    self.actors_target[i](
                        tmp_no_final_s[:,  # [torch.FloatTensor of size 989x213]
                                       i, :]) for i in range(self.n_agents)
                ]
                non_final_next_actions = Variable(
                    th.stack(non_final_next_actions).type(FloatTensor))

            target_Q = Variable(th.zeros(self.batch_size, 1).type(FloatTensor))
            # print('non_final_mask',non_final_mask)

            if initial_train is True:
                target_Q[non_final_mask] = self.critics_target[agent](
                    tmp_no_final_s.view((-1, self.n_agents * self.n_states)),
                    non_final_next_actions.view(
                        (-1, self.n_agents * self.n_actions)))
            else:
                if agent != 4:
                    target_Q[non_final_mask] = self.critics_target[agent](
                        tmp_no_final_s.view(
                            (-1, (self.n_agents - 1) * self.n_states)),
                        non_final_next_actions.view(
                            (-1, (self.n_agents - 1) * self.n_actions)))
                else:
                    target_Q[non_final_mask] = self.critics_target[agent](
                        tmp_no_final_s.view(
                            (-1, self.n_agents * self.n_states)),
                        non_final_next_actions.view(
                            (-1, self.n_agents * self.n_actions)))

            # scale_reward: to scale reward in Q functions
            target_Q = (target_Q * self.GAMMA) + (
                reward_batch[:, agent].reshape(self.batch_size, 1) *
                scale_reward)

            loss_Q = nn.MSELoss()(current_Q, target_Q.detach())
            loss_Q.backward()
            self.critic_optimizer[agent].step()

            self.actor_optimizer[agent].zero_grad()
            state_i = state_batch[:, agent, :]

            action_i = self.actors[agent](state_i) * 4
            ac = tmp_action.clone()
            if initial_train is False and agent != 4:
                for j in range(self.batch_size):
                    if agent < batch.max_id[j][agent]:
                        tmp_agent = agent
                    else:
                        tmp_agent = agent - 1
                    ac[j, tmp_agent, :] = action_i[j]
            if agent == 4:
                ac[:, 4, :] = action_i
                ac[:, 3, :] = action_i

            if initial_train is True:
                ac[:, agent, :] = action_i
            whole_action = ac.view(self.batch_size, -1)
            actor_loss = -self.critics[agent](whole_state, whole_action)
            actor_loss = actor_loss.mean()
            actor_loss.backward()
            self.actor_optimizer[agent].step()
            c_loss.append(loss_Q)
            a_loss.append(actor_loss)

        if self.steps_done % 100 == 0 and self.steps_done > 0:
            for i in range(self.n_agents):
                soft_update(self.critics_target[i], self.critics[i], self.tau)
                soft_update(self.actors_target[i], self.actors[i], self.tau)

        return c_loss, a_loss
Example #16
    def update_policy(self):
        # do not train until exploration is enough
        if self.episode_done <= self.episodes_before_train:
            return None, None

        ByteTensor = th.cuda.ByteTensor if self.use_cuda else th.ByteTensor
        FloatTensor = th.cuda.FloatTensor if self.use_cuda else th.FloatTensor

        c_loss = []
        a_loss = []

        critics_grad = []
        actors_grad = []
        for agent in range(self.n_agents):
            transitions = self.memory.sample(self.batch_size)
            batch = Experience(*zip(*transitions))
            non_final_mask = ByteTensor(list(map(lambda s: s is not None,
                                                 batch.next_states)))
            # state_batch: batch_size x n_agents x dim_obs
            state_batch = Variable(th.stack(batch.states).type(FloatTensor))
            action_batch = Variable(th.stack(batch.actions).type(FloatTensor))
            reward_batch = Variable(th.stack(batch.rewards).type(FloatTensor))
            # : (batch_size_non_final) x n_agents x dim_obs
            non_final_next_states = Variable(th.stack(
                [s for s in batch.next_states if s is not None]).type(FloatTensor))

            # for current agent
            whole_state = state_batch.view(self.batch_size, -1)
            whole_action = action_batch.view(self.batch_size, -1)

            # critic network
            self.critic_optimizer[agent].zero_grad()
            current_Q = self.models.critics[agent](whole_state, whole_action)   # forward?

            non_final_next_actions = [
                self.actors_target[i](non_final_next_states[:, i, :]) for i in range(self.n_agents)]
            non_final_next_actions = th.stack(non_final_next_actions)
#            non_final_next_actions = Variable(non_final_next_actions)
            non_final_next_actions = (
                non_final_next_actions.transpose(0, 1).contiguous())

            target_Q = Variable(th.zeros(self.batch_size).type(FloatTensor))
            target_Q[non_final_mask] = self.critics_target[agent](
                non_final_next_states.view(-1, self.n_agents * self.n_states),
                non_final_next_actions.view(-1, self.n_agents * self.n_actions))

            # scale_reward: to scale reward in Q functions
            target_Q = (target_Q * self.GAMMA) + (reward_batch[:, agent] * scale_reward)

            loss_Q = nn.MSELoss()(current_Q, target_Q.detach())
            loss_Q.backward()

            self.critic_optimizer[agent].step()

            # actor network
            self.actor_optimizer[agent].zero_grad()
            state_i = state_batch[:, agent, :]
            action_i = self.models.actors[agent](state_i)   # forward
            ac = action_batch.clone()
            ac[:, agent, :] = action_i
            whole_action = ac.view(self.batch_size, -1)
            actor_loss = -self.models.critics[agent](whole_state, whole_action)     # forward
            actor_loss = actor_loss.mean()
            actor_loss.backward()
            self.actor_optimizer[agent].step()

            c_loss.append(loss_Q)
            a_loss.append(actor_loss)

            # for test
            '''
            s = 0
            for x in self.models.critics[agent].parameters():
                s += 1
                print('s: ', s)
                print(type(x))
                print('x.grad.shape: ', x.grad.size())
                print('x.data.shape: ', x.data.size())
            '''
            critics_agent_grad = []
            actors_agent_grad = []
            for x in self.models.critics[agent].parameters():
                critics_agent_grad.append(x.grad.data.norm(2))
                # critics_agent_grad.append(th.mean(x.grad).data[0])
            for x in self.models.actors[agent].parameters():
                actors_agent_grad.append(x.grad.data.norm(2))
                # actors_agent_grad.append(th.mean(x.grad).data[0])

            critics_grad.append(critics_agent_grad)
            actors_grad.append(actors_agent_grad)


        if self.steps_done % 100 == 0 and self.steps_done > 0:
            for i in range(self.n_agents):
                soft_update(self.critics_target[i], self.models.critics[i], self.tau)
                soft_update(self.actors_target[i], self.models.actors[i], self.tau)

        '''
        # gradient clipping
        if self.clip is not None:
            nn.utils.clip_grad_norm(self.model.parameters(), self.clip)
        '''

        # return c_loss, a_loss  #, critics_grad, actors_grad
        return critics_grad, actors_grad
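Note: every update_policy() variant samples from self.memory and unpacks the result with Experience(*zip(*transitions)). A minimal replay buffer consistent with that usage is sketched below; the class name ReplayMemory and its internals are assumptions for illustration, only the Experience field names follow the snippets.

import random
from collections import deque, namedtuple

Experience = namedtuple('Experience',
                        ('states', 'actions', 'next_states', 'rewards'))

class ReplayMemory:
    def __init__(self, capacity):
        # Oldest transitions are evicted automatically once capacity is reached.
        self.buffer = deque(maxlen=capacity)

    def push(self, *args):
        self.buffer.append(Experience(*args))

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)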
Example #17
	def play(self):
		s = self.s
		if self.step < self.learning_starts:
			a = self.env.action_space.sample()
		else:
			a = self.epsilon_greedy()
		old_lives = self.env.lives()
		SP, r, terminal, step_info = self.env.step(a)
		new_lives = self.env.lives()
		self.episode_scores += r
		sp = self.preprocess_concat_frames(SP)

		if new_lives < old_lives:
			print('agent died, current lives = ', new_lives)
			r = min(-1.0, r)

		if (terminal and new_lives>0):
			task_done = True
			done = 1
			r = max(1.0,r)
			print('task is solved successfully, end of episode')
		else:
			task_done = False
			done = 0

		if terminal and new_lives==0:
			print('agent terminated, end of episode') 
			r = min(-1.0,r)

		if ('Pong' in self.env.task):
			if terminal and (self.episode_scores > 17):
				task_done = True
				done = 1
				r = max(1.0,r)
				print('task is solved successfully, end of episode')
			else:
				task_done = False
				done = 0

		# if r < 0.0 or isclose(r, 0.0):
		# 	r = min(-0.01,r)

		self.episode_rewards += r

		experience = Experience(s, a, r, sp, done)
		self.experience_memory.push(experience)
		self.s = copy.deepcopy(sp)

		if terminal or task_done:
			self.episode_steps_list.append(self.step) 
			self.episode_scores_list.append(self.episode_scores)
			self.episode_rewards_list.append(self.episode_rewards)			
			self.episode_end_time = time.time()
			episode_time = self.episode_end_time - self.episode_start_time
			if self.there_was_a_test is True:
				episode_time = episode_time - self.test_duration
				self.there_was_a_test = False
			self.episode_time_list.append(episode_time)
			print('episode score: ', self.episode_scores)
			print('episode time: {0:.2f}' .format(episode_time))

			self.game_episode += 1
			print('-'*60)
			print('game episode: ', self.game_episode)
			print('time step: ', self.step)
			self.episode_rewards = 0.0
			self.episode_scores = 0.0				
			self.episode_start_time = time.time()
			S = self.env.reset() # reset S
			self.s = self.preprocess_concat_frames(S)
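Note: play() defers to self.epsilon_greedy() once learning has started, but that helper is not shown. The standalone sketch below only illustrates the usual idea of mixing random and greedy actions; the parameters qnet, epsilon and device are hypothetical placeholders, not part of the original class.

import random
import torch

def epsilon_greedy(env, qnet, state, epsilon, device='cpu'):
    # With probability epsilon take a random action, otherwise act greedily w.r.t. Q.
    if random.random() < epsilon:
        return env.action_space.sample()
    with torch.no_grad():
        s = torch.as_tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
        q_values = qnet(s)                     # shape: (1, n_actions)
    return int(q_values.argmax(dim=1).item())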
Example #18
model.cuda()

optimizer_meta_actor = Adam(model.parameters(), lr=0.001)
optimizer_config_network = Adam(config_network.parameters(), lr=0.001)

for t in range(100000):

    ByteTensor = pt.cuda.ByteTensor if use_cuda else pt.ByteTensor
    FloatTensor = pt.cuda.FloatTensor if use_cuda else pt.FloatTensor

    random_position = np.random.randint(low=length_lstm,
                                        high=min(
                                            memory.__len__(),
                                            n_episode * n_agents * max_steps))
    memory_info = memory.get_item(random_position, length_lstm)
    batch = Experience(*zip(*memory_info))
    state_batch = Variable(pt.stack(batch.states).type(FloatTensor))
    action_batch = Variable(pt.stack(batch.actions).type(FloatTensor))

    for i in range(n_agents):

        optimizer_meta_actor.zero_grad()
        whole_state = state_batch[0:length_lstm - 1,
                                  i, :].view(length_lstm - 1, 22)
        whole_action = action_batch[0:length_lstm - 1, i, :].view(
            length_lstm - 1, 2) / 4
        final_state = state_batch[length_lstm - 1, i, :]
        final_action = action_batch[length_lstm - 1, i, :]

        #pre_data_samples = pt.cat((whole_state, whole_action),1).unsqueeze(0)
        pre_data_samples = whole_state.unsqueeze(0)
Example #19
    def remember(self, states, actions, rewards, next_states, dones):
        '''Populates the replay memory with a new batch of data; observations of all agents'''
        self.memory.add(
            Experience(states, actions, rewards, next_states, dones))
Example #20
    def forward(self, obs, acts):
        result = F.relu(self.FC1(obs))
        combined = pt.cat([result, acts], 1)
        result = F.relu(self.FC2(combined))
        return self.FC4(F.relu(self.FC3(result)))


model = meta_critic(5, 22, 2)
model.cuda()

optimizer = Adam(model.parameters(), lr=0.0001)

for t in range(100000):

    transitions = memory.sample(batch_size)
    batch = Experience(*zip(*transitions))
    optimizer.zero_grad()

    state_batch = Variable(pt.stack(batch.states).type(FloatTensor))
    action_batch = Variable(pt.stack(batch.actions).type(FloatTensor))

    Q_batch = Variable(pt.stack(batch.rewards).type(FloatTensor))

    whole_state = state_batch.view(batch_size, -1)
    whole_action = action_batch.view(batch_size, -1)
    whole_Q = Q_batch.view(batch_size, -1)

    prediction = model(whole_state, whole_action)

    #target_Q = Variable(pt.zeros(batch_size, 1).type(FloatTensor))
Example #21
                (subgoal_mask.x,subgoal_mask.y) )
        intrinsic_done = 1
        tilde_r = +1
        subgoal_index, subgoal_mask = \
         sample_from_random_subgoal_set(random_subgoals_set)
        subgoal_frame = create_mask_frame(base_img, subgoal_mask)
    else:
        intrinsic_done = 0
        tilde_r = -1

    if terminal and env.unwrapped.ale.lives() > 0:
        done = 1
    else:
        done = 0

    experience = Experience(s, g, g_id, a, r, tilde_r, sp, intrinsic_done,
                            done, man_loc)
    experience_memory.push(experience)

    epsilon = max(0.1, 1 - (1 - 0.1) * t / 1000000)
    s = deepcopy(sp)
    steps += 1

    if terminal or (steps > MAX_STEPS):
        S = reset()  # s is reserved for 4*84*84 input image
        s = four_frames_to_4_84_84(S)
        man_mask = get_man_mask(S)
        man_loc = get_man_xy_np_coordinate(man_mask)
        subgoal_index, subgoal_mask = sample_from_random_subgoal_set(
            random_subgoals_set)  # random g
        subgoal_frame = create_mask_frame(base_img, subgoal_mask)
        g = single_channel_frame_to_1_84_84(subgoal_frame)
Example #22
    def update_policy(self):
        # do not train until exploration is enough
        if self.episode_done <= self.episodes_before_train:
            return None, None

        ByteTensor = th.cuda.ByteTensor if self.use_cuda else th.ByteTensor
        FloatTensor = th.cuda.FloatTensor if self.use_cuda else th.FloatTensor

        # critic loss per agent
        c_loss = []
        # actor loss per agent
        a_loss = []
        # loop: for each agent, sample its transitions
        for agent in range(self.n_agents):
            # sample transitions from the replay memory
            transitions = self.memory.sample(self.batch_size)
            # The * operator unpacks the list of transitions into its per-step tuples.
            # zip(*transitions) then yields [(state1, state2), (action1, action2), (next_state1, next_state2), (reward1, reward2)],
            # so batch = Experience(states=(1, 5), actions=(2, 6), next_states=(3, 7), rewards=(4, 8))
            batch = Experience(*zip(*transitions))
            # mask marking which next states are non-terminal
            # list(map(...)) returns e.g. [True, True]
            # and ByteTensor converts it to tensor([1, 1], dtype=torch.uint8)
            non_final_mask = ByteTensor(
                list(map(lambda s: s is not None, batch.next_states)))
            # state_batch: batch_size x n_agents x dim_obs
            state_batch = Variable(th.stack(batch.states).type(FloatTensor))
            action_batch = Variable(th.stack(batch.actions).type(FloatTensor))
            reward_batch = Variable(th.stack(batch.rewards).type(FloatTensor))
            # : (batch_size_non_final) x n_agents x dim_obs
            non_final_next_states = Variable(
                th.stack([s for s in batch.next_states
                          if s is not None]).type(FloatTensor))

            # for the current agent
            # reshape with view:
            # whole_state has shape (batch_size, n_agents * dim_obs)
            whole_state = state_batch.view(self.batch_size, -1)
            whole_action = action_batch.view(self.batch_size, -1)

            # zero the critic optimizer's gradients, i.e. reset d(loss)/d(weight) to 0
            self.critic_optimizer[agent].zero_grad()
            # current Q value, evaluated with the current critic
            current_Q = self.critics[agent](whole_state, whole_action)

            non_final_next_actions = [
                self.actors_target[i](non_final_next_states[:, i, :])
                for i in range(self.n_agents)
            ]
            non_final_next_actions = th.stack(non_final_next_actions)
            # transpose swaps dimensions 0 and 1
            # contiguous makes the tensor contiguous in memory so the following view works
            non_final_next_actions = (non_final_next_actions.transpose(
                0, 1).contiguous())

            # TODO: this block is reused as-is; the underlying math is not re-derived here
            # initialize target_Q
            target_Q = th.zeros(self.batch_size).type(FloatTensor)

            target_Q[non_final_mask] = self.critics_target[agent](
                non_final_next_states.view(-1, self.n_agents * self.n_states),
                non_final_next_actions.view(-1, self.n_agents *
                                            self.n_actions)).squeeze()
            # scale_reward: to scale reward in Q functions

            target_Q = (target_Q.unsqueeze(1) * self.GAMMA) + (
                reward_batch[:, agent].unsqueeze(1) * scale_reward)

            loss_Q = nn.MSELoss()(current_Q, target_Q.detach())
            loss_Q.backward()
            self.critic_optimizer[agent].step()

            self.actor_optimizer[agent].zero_grad()
            state_i = state_batch[:, agent, :]
            action_i = self.actors[agent](state_i)
            ac = action_batch.clone()
            ac[:, agent, :] = action_i
            whole_action = ac.view(self.batch_size, -1)
            actor_loss = -self.critics[agent](whole_state, whole_action)
            actor_loss = actor_loss.mean()
            actor_loss.backward()
            self.actor_optimizer[agent].step()
            c_loss.append(loss_Q)
            a_loss.append(actor_loss)

        if self.steps_done % 100 == 0 and self.steps_done > 0:
            for i in range(self.n_agents):
                soft_update(self.critics_target[i], self.critics[i], self.tau)
                soft_update(self.actors_target[i], self.actors[i], self.tau)

        return c_loss, a_loss
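Note: the comment about Experience(*zip(*transitions)) above can be checked with the short, self-contained demo below; the numeric values are dummy data for illustration only.

from collections import namedtuple

Experience = namedtuple('Experience', ('states', 'actions', 'next_states', 'rewards'))

transitions = [Experience(1, 2, 3, 4), Experience(5, 6, 7, 8)]
batch = Experience(*zip(*transitions))
print(batch.states)       # (1, 5)
print(batch.actions)      # (2, 6)
print(batch.next_states)  # (3, 7)
print(batch.rewards)      # (4, 8)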
Example #23
    def update_policy(self):
        # do not train until exploration is enough
        if self.episode_done <= self.episodes_before_train:
            return None, None

        ByteTensor = th.cuda.ByteTensor if self.use_cuda else th.ByteTensor
        FloatTensor = th.cuda.FloatTensor if self.use_cuda else th.FloatTensor

        c_loss = []
        a_loss = []
        for agent in range(self.n_agents):
            transitions = self.memory.sample(self.batch_size)
            batch = Experience(*zip(*transitions))
            non_final_mask = ByteTensor(
                list(map(lambda s: s is not None, batch.next_states)))
            # state_batch: batch_size x n_agents x dim_obs
            state_batch = th.stack(batch.states).type(FloatTensor)
            action_batch = th.stack(batch.actions).type(FloatTensor)
            reward_batch = th.stack(batch.rewards).type(FloatTensor)
            # : (batch_size_non_final) x n_agents x dim_obs
            non_final_next_states = th.stack([
                s for s in batch.next_states if s is not None
            ]).type(FloatTensor)

            # for current agent
            whole_state = state_batch.view(self.batch_size, -1)
            whole_action = action_batch.view(self.batch_size, -1)
            self.critic_optimizer[agent].zero_grad()
            current_Q = self.critics[agent](whole_state, whole_action)

            non_final_next_actions = [
                self.actors_target[i](non_final_next_states[:, i, :])
                for i in range(self.n_agents)
            ]
            non_final_next_actions = th.stack(non_final_next_actions)
            non_final_next_actions = (non_final_next_actions.transpose(
                0, 1).contiguous())

            target_Q = th.zeros(self.batch_size).type(FloatTensor)

            target_Q[non_final_mask] = self.critics_target[agent](
                non_final_next_states.view(-1, self.n_agents * self.n_states),
                non_final_next_actions.view(-1, self.n_agents *
                                            self.n_actions)).squeeze()
            # scale_reward: to scale reward in Q functions

            target_Q = (target_Q.unsqueeze(1) * self.GAMMA) + (
                reward_batch[:, agent].unsqueeze(1) * scale_reward)

            loss_Q = nn.MSELoss()(current_Q, target_Q.detach())
            loss_Q.backward()
            self.critic_optimizer[agent].step()

            self.actor_optimizer[agent].zero_grad()
            state_i = state_batch[:, agent, :]
            action_i = self.actors[agent](state_i)
            ac = action_batch.clone()
            ac[:, agent, :] = action_i
            whole_action = ac.view(self.batch_size, -1)
            actor_loss = -self.critics[agent](whole_state, whole_action)
            actor_loss = actor_loss.mean()
            actor_loss.backward()
            self.actor_optimizer[agent].step()
            c_loss.append(loss_Q)
            a_loss.append(actor_loss)

        if self.steps_done % 100 == 0 and self.steps_done > 0:
            for i in range(self.n_agents):
                soft_update(self.critics_target[i], self.critics[i], self.tau)
                soft_update(self.actors_target[i], self.actors[i], self.tau)

        return c_loss, a_loss
Example #24
class DDPG(object):
    def __init__(self,
                 env,
                 mem_size=7 * int(1e3),
                 lr_critic=1e-3,
                 lr_actor=1e-4,
                 epsilon=1.,
                 max_epi=1500,
                 epsilon_decay=1. / (1e5),
                 gamma=.99,
                 target_update_frequency=200,
                 batch_size=64,
                 random_process=True,
                 max_step=None):
        self.CUDA = torch.cuda.is_available()

        self.orig_env = env  #for recording
        if max_step is not None:
            self.orig_env._max_episode_steps = max_step
        self.env = self.orig_env
        self.N_S = self.env.observation_space.shape[0]
        self.N_A = self.env.action_space.shape[0]
        self.MAX_EPI = max_epi
        self.LOW = self.env.action_space.low
        self.HIGH = self.env.action_space.high

        self.actor = Actor(self.N_S, self.N_A)
        self.critic = Critic(self.N_S, self.N_A)
        self.target_actor = Actor(self.N_S, self.N_A)
        self.target_critic = Critic(self.N_S, self.N_A)
        self.target_actor.eval()
        self.target_critic.eval()
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())
        if self.CUDA:
            self.actor.cuda()
            self.critic.cuda()
            self.target_actor.cuda()
            self.target_critic.cuda()

        self.exp = Experience(mem_size)
        self.optim_critic = optim.Adam(self.critic.parameters(), lr=lr_critic)
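        # note: the actor optimizer below uses a negative learning rate, so Adam ascends
        # the objective backpropagated in update_actor_critic() (the mean critic value)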
        self.optim_actor = optim.Adam(self.actor.parameters(), lr=-lr_actor)
        self.random_process = OrnsteinUhlenbeckProcess(\
                size=self.N_A, theta=.15, mu=0, sigma=.2)
        self.EPSILON = epsilon
        self.EPSILON_DECAY = epsilon_decay
        self.GAMMA = gamma
        self.TARGET_UPDATE_FREQUENCY = target_update_frequency
        self.BATCH_SIZE = batch_size

        title = {common.S_EPI: [], common.S_TOTAL_R: []}
        self.data = pd.DataFrame(title)
        self.RAND_PROC = random_process

    def train(self, dir=None, interval=1000):
        if dir is not None:
            self.env = wrappers.Monitor(self.orig_env,
                                        '{}/train_record'.format(dir),
                                        force=True)
            os.mkdir(os.path.join(dir, 'models'))
        update_counter = 0
        epsilon = self.EPSILON
        for epi in trange(self.MAX_EPI, desc='train epi', leave=True):
            self.random_process.reset_states()
            o = self.env.reset()

            counter = 0
            acc_r = 0
            while True:
                counter += 1

                #if dir is not None:
                #    self.env.render()

                a = self.choose_action(o)

                if self.RAND_PROC:
                    a += max(epsilon, 0) * self.random_process.sample()
                    a = np.clip(a, -1., 1.)
                    epsilon -= self.EPSILON_DECAY

                o_, r, done, info = self.env.step(self.map_to_action(a))
                self.exp.push(o, a, r, o_, done)

                if epi > 0:
                    self.update_actor_critic()
                    update_counter += 1
                    if update_counter % self.TARGET_UPDATE_FREQUENCY == 0:
                        self.update_target()

                acc_r += r
                o = o_
                if done:
                    break
            if dir is not None:
                if (epi + 1) % interval == 0:
                    self.save(os.path.join(dir, 'models'),
                              str(epi + 1),
                              save_data=False)
            s = pd.Series([epi, acc_r], index=[common.S_EPI, common.S_TOTAL_R])
            self.data = self.data.append(s, ignore_index=True)

    def choose_action(self, state):
        self.actor.eval()
        s = Variable(torch.Tensor(state)).unsqueeze(0)
        if self.CUDA:
            s = s.cuda()
        a = self.actor(s).data.cpu().numpy()[0].astype('float64')
        self.actor.train()
        return a

    def map_to_action(self, a):
        return (self.LOW + self.HIGH) / 2 + a * (self.HIGH - self.LOW) / 2

    def update_target(self):
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())

    def update_actor_critic(self):
        # sample minibatch
        minibatch = common.Transition(*zip(*self.exp.sample(self.BATCH_SIZE)))
        bat_o = Variable(torch.Tensor(minibatch.state))
        bat_a = Variable(torch.Tensor(minibatch.action))
        bat_r = Variable(torch.Tensor(minibatch.reward)).unsqueeze(1)
        bat_o_ = Variable(torch.Tensor(minibatch.next_state))
        bat_not_done_mask = list(
            map(lambda done: 0 if done else 1, minibatch.done))
        bat_not_done_mask = Variable(
            torch.ByteTensor(bat_not_done_mask)).unsqueeze(1)
        if self.CUDA:
            bat_o = bat_o.cuda()
            bat_a = bat_a.cuda()
            bat_r = bat_r.cuda()
            bat_o_ = bat_o_.cuda()
            bat_not_done_mask = bat_not_done_mask.cuda()

        # update critic
        bat_a_o_ = self.target_actor(bat_o_)

        Gt = bat_r
        Gt[bat_not_done_mask] += self.GAMMA * self.target_critic(
            bat_o_, bat_a_o_)[bat_not_done_mask]
        Gt.detach_()
        eval_o = self.critic(bat_o, bat_a)
        criterion = nn.MSELoss()
        if self.CUDA:
            criterion.cuda()
        loss = criterion(eval_o, Gt)
        self.optim_critic.zero_grad()
        loss.backward()
        self.optim_critic.step()

        # update actor
        self.critic.eval()
        bat_a_o = self.actor(bat_o)
        obj = torch.mean(self.critic(bat_o, bat_a_o))
        self.optim_actor.zero_grad()
        obj.backward()
        self.optim_actor.step()
        self.critic.train()

    def test(self, dir=None, n=1):
        if dir is not None:
            self.env = wrappers.Monitor(self.orig_env,
                                        '{}/test_record'.format(dir),
                                        force=True,
                                        video_callable=lambda episode_id: True)

        title = {common.S_EPI: [], common.S_TOTAL_R: []}
        df = pd.DataFrame(title)

        for epi in trange(n, desc='test epi', leave=True):
            o = self.env.reset()
            acc_r = 0
            while True:
                #if dir is not None:
                #    self.env.render()
                a = self.choose_action(o)
                o_, r, done, info = self.env.step(self.map_to_action(a))
                acc_r += r
                o = o_
                if done:
                    break
            s = pd.Series([epi, acc_r], index=[common.S_EPI, common.S_TOTAL_R])
            df = df.append(s, ignore_index=True)
        if dir is not None:
            df.to_csv('{}/test_data.csv'.format(dir))
        else:
            print(df)

    def save(self, dir, suffix='', save_data=True):
        torch.save(self.actor.state_dict(),
                   '{}/actor{}.pt'.format(dir, suffix))
        torch.save(self.critic.state_dict(),
                   '{}/critic{}.pt'.format(dir, suffix))
        if save_data:
            self.data.to_csv('{}/train_data{}.csv'.format(dir, suffix))

    def load_actor(self, dir):
        self.actor.load_state_dict(torch.load(dir))

    def load_critic(self, dir):
        self.critic.load_state_dict(torch.load(dir))

    def get_data(self):
        return self.data
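Note: the DDPG class above draws exploration noise from an OrnsteinUhlenbeckProcess with reset_states() and sample() methods. A minimal sketch consistent with that usage is given below; the dt = 1 discretization and the class internals are assumptions, not the original implementation.

import numpy as np

class OrnsteinUhlenbeckProcess:
    def __init__(self, size, theta=0.15, mu=0.0, sigma=0.2):
        self.size, self.theta, self.mu, self.sigma = size, theta, mu, sigma
        self.reset_states()

    def reset_states(self):
        # Restart the process at its mean at the beginning of each episode.
        self.x = np.ones(self.size) * self.mu

    def sample(self):
        # Euler-Maruyama step of dx = theta * (mu - x) dt + sigma dW, with dt = 1.
        dx = self.theta * (self.mu - self.x) + self.sigma * np.random.randn(self.size)
        self.x = self.x + dx
        return self.x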