Example No. 1
    def train_v2(self, model_path, log_path):
        '''
        At every time slot, update the network parameters from the one-step TD error.
        '''
        self.epi_rewards_mean, self.epi_durations, self.episode = [], [], []
        self.actor_losses, self.critic_losses = [], []
        self.step_count = []
        for epi in range(self.conf['num_episode']):  # episodes
            log_probs, values, rewards, masks = [], [], [], []
            entropy, epi_reward, epi_duration, step = 0, 0.0, 0.0, 0
            print("--- episode %s ---" % (epi))
            self.episode.append(epi)

            state = self.env.reset()  # [2*num_plant, 1]
            t = P.t_start
            while t < P.t_end:  # one episode (simulation)
                epi_duration = t
                if round(t, 3) * 1000 % 10 == 0:  # every 10 ms, schedule update
                    # action = self.agent.select_action(state_ts)     # action type: tensor [1X1]
                    state_ts = to_tensor(state).reshape(-1)  # [2*num_plant]
                    dist = self.agent.actor(state_ts)    # pi(a|s)
                    value = self.agent.critic(state_ts)  # V(s)
                    action = dist.sample()  # scalar tensor
                    next_state, reward, done, info = self.env.step(
                        action.cpu().numpy().item(), t)  # next_state: [(2*num_plant) x 1]

                    actor_loss, critic_loss = self.agent.optimization_model_v2(
                        dist, action, state_ts, to_tensor(next_state).reshape(-1),
                        reward, done)

                    state = next_state
                    epi_reward += reward
                    step += 1
                    self.step_count.append(step)
                    rewards.append(reward)
                    self.actor_losses.append(actor_loss)
                    self.critic_losses.append(critic_loss)
                    if done:
                        break
                else:  # every 1 ms
                    self.env.update_plant_state(t)  # plant status update
                t = t + P.Ts
            # episode done
            print("epi_duration:", epi_duration)
            print("mean epi_reward:", epi_reward / len(rewards))
            self.epi_rewards_mean.append(epi_reward / len(rewards))
            self.epi_durations.append(epi_duration)

        # Save state_dict
        torch.save(self.agent.actor.state_dict(), model_path)
        # torch.save(self.agent.critic.state_dict(), CRITIC_MODEL)
        self.save_log(log_path)
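
The helper to_tensor used throughout these examples is not shown on this page. A minimal sketch of what it is assumed to do (the signature and defaults are guesses, not taken from the project): convert a numpy array or scalar to a float tensor and move it to the requested device.

import numpy as np
import torch


def to_tensor(ndarray, is_cuda=True, device=None, dtype=torch.float32):
    # Convert a numpy array (or scalar) into a torch tensor of the given dtype.
    tensor = torch.as_tensor(np.asarray(ndarray), dtype=dtype)
    if device is not None:
        tensor = tensor.to(device)
    elif is_cuda and torch.cuda.is_available():
        tensor = tensor.cuda()
    return tensor
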
Example No. 2
    def train(self, model_path, log_path):
        self.epi_durations, self.epi_rewards_mean, self.episode, self.step_count, self.epi_loss = [], [], [], [], []

        for epi in range(self.conf['num_episode']):  # episodes
            rewards = []
            self.episode.append(epi)
            step = 0
            epi_reward = 0.0
            print("--- episode %s ---" % (epi))
            state = self.env.reset()
            state_ts = to_tensor(state).reshape(-1).unsqueeze(0)
            t = P.t_start
            while t < P.t_end:  # one episode (simulation)
                epi_duration = t
                if round(t, 3) * 1000 % 10 == 0:  # every 10 ms, schedule update
                    action = self.agent.select_action(state_ts)
                    next_state, reward, done, info = self.env.step(
                        action.item(), t)  # next_state: [(2*num_plant) x 1]
                    done_mask = 0.0 if done else 1.0

                    next_state_ts = to_tensor(next_state).reshape(-1).unsqueeze(0)
                    reward_ts = to_tensor(np.asarray(reward).reshape(-1))
                    self.agent.memory.push_transition(state_ts, action,
                                                      next_state_ts, reward_ts)
                    state_ts = next_state_ts
                    epi_reward += reward
                    step += 1
                    self.step_count.append(step)
                    rewards.append(reward)
                    if self.agent.memory.length() >= self.conf['memory_capacity']:
                        loss = self.agent.update()
                        self.epi_loss.append(loss)
                    if done:
                        break
                else:  # every 1 ms
                    self.env.update_plant_state(t)  # plant status update
                t = t + P.Ts

            if epi % self.conf['target_update'] == 0 and epi != 0:
                print("target update")
                self.agent.q_target.load_state_dict(self.agent.q.state_dict())
            self.epi_durations.append(epi_duration)
            self.epi_rewards_mean.append(epi_reward / len(rewards))
            print("epi_duration:", epi_duration)
            # print("epi_reward:%s, len:%s"%(epi_reward, len(rewards)))
            print("mean epi_reward:", epi_reward / len(rewards))
            if epi % 10 == 0 and epi != 0:
                torch.save(self.agent.q.state_dict(), model_path)
                self.save_log(log_path)
        torch.save(self.agent.q.state_dict(), model_path)
        self.save_log(log_path)
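
Example No. 2 leans on self.agent.update() and the replay memory, neither of which is shown here. Below is a minimal sketch of the standard one-step DQN update it is assumed to perform, assuming memory.sample returns batched (state, action, next_state, reward) tensors; self.optimizer and self.conf['gamma'] are assumed names, and terminal masking is omitted because push_transition above does not store a done flag.

    def update(self):
        # Assumes: import torch; import torch.nn.functional as F
        # Sample a mini-batch of stored transitions (assumed layout).
        states, actions, next_states, rewards = self.memory.sample(self.conf['batch_size'])
        # Q(s, a) for the actions that were actually taken.
        q_values = self.q(states).gather(1, actions.long().view(-1, 1))
        # Bootstrapped target: r + gamma * max_a' Q_target(s', a').
        with torch.no_grad():
            next_q = self.q_target(next_states).max(dim=1, keepdim=True)[0]
            target_q = rewards.view(-1, 1) + self.conf['gamma'] * next_q
        loss = F.smooth_l1_loss(q_values, target_q)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()
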
Example No. 3
    def test(self, env, model_path):
        test_actions = []
        # new agent
        new_agent = DQN(self.conf, self.device)
        new_agent.load_model(model_path)

        epi_reward = 0.0
        epi_duration = 0.0
        state = env.reset()  # [2*num_plant, 1]
        # unsqueeze(0) gives the batch shape the replay memory expects
        state_ts = to_tensor(state).unsqueeze(0)  # [1, 2*num_plant, 1]
        # realtimePlot = testDataPlotter(self.conf)
        done = False
        t = P.t_start
        while t < P.t_end:
            t_next_plot = t + P.t_plot
            epi_duration = t
            while t < t_next_plot:  # data plot period
                if round(t, 3) * 1000 % 10 == 0:  # every 10 ms, schedule update
                    action = new_agent.select_action(state_ts)  # action type: tensor [1x1]
                    next_state, reward, done, info = env.step(
                        action.item(), t)  # next_state: [(2*num_plant) x 1]
                    state_ts = to_tensor(next_state).unsqueeze(0)  # advance the state
                    epi_reward += reward
                    test_actions.append(action.item())
                    if done: break
                else:  # every 1 ms
                    env.update_plant_state(t)  # plant status update
                t = t + P.Ts
            # self.update_dataPlot(realtimePlot, t, env) # update data plot
            if done: break
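
new_agent.load_model is not shown in Example No. 3. A plausible sketch, assuming the checkpoint is the Q-network state_dict saved by Example No. 2:

    def load_model(self, model_path):
        # Restore weights saved with torch.save(self.q.state_dict(), model_path).
        self.q.load_state_dict(torch.load(model_path, map_location=self.device))
        self.q.eval()
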
Example No. 4
    def optimization_model(self, next_state, rewards, log_probs, values, masks):
        '''
        next_state : episode last state, which is used for G_t (return)
        rewards : a list that includes all rewards during an episode
        log_probs : a list that includes all log pi(a_t|s_t) during an episode, relative to actor network
        values : a list that includes all V(s_t), relative to critic network
        '''
        next_state_ts = to_tensor(next_state).reshape(-1)  # [5*num_plant]
        next_value = self.critic(next_state_ts)  # V(s_{t+1}), used to bootstrap the return
        returns = self.compute_returns(next_value, rewards, masks)  # G_t = r_t + gamma * G_{t+1}
        
        log_probs = torch.cat(log_probs)
        returns = torch.cat(returns).detach()
        values = torch.cat(values)

        advantage = returns - values  # A_t = G_t - V(s_t)

        actor_loss = -(log_probs * advantage.detach()).mean()  # mean of -log_pi(a_t|s_t) * A_t
        critic_loss = advantage.pow(2).mean()  # mean of A_t^2
        
        self.optimizerA.zero_grad()
        self.optimizerC.zero_grad()
        actor_loss.backward()
        critic_loss.backward()
        self.optimizerA.step()
        self.optimizerC.step()
        
        return actor_loss, critic_loss
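
self.compute_returns is not shown in Example No. 4. A minimal sketch of the discounted-return recursion it is assumed to implement: work backwards from the bootstrap value V(s_T), with masks zeroing the bootstrap at terminal steps.

    def compute_returns(self, next_value, rewards, masks, gamma=0.99):
        # G_t = r_t + gamma * mask_t * G_{t+1}, computed backwards over the episode.
        R = next_value
        returns = []
        for step in reversed(range(len(rewards))):
            R = rewards[step] + gamma * masks[step] * R
            returns.insert(0, R)
        return returns
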
Example No. 5
    def scheduler(self, state):
        '''
        input: states for all plants
        output: current scheduled plant
        '''
        state_ts = to_tensor(state, is_cuda=False,
                             device=self.device).reshape(-1)
        # print("state_ts:", state_ts)
        dist = self.agent.actor(state_ts)
        action = dist.sample()

        return action
Example No. 6
    def select_action(self, state, noise_enable=True, decay_epsilon=True):
        # input shape: [batch(=1) x state_dim]; actor returns a tuple, action shape [batch x action_dim]
        action, _ = self.actor(to_tensor(state).reshape(-1).unsqueeze(0))
        action = action.cpu().detach().numpy().squeeze(0)  # action shape: [action_dim,]
        if noise_enable:
            action += self.is_training * max(self.epsilon, 0) * self.random_process.sample()
        action = np.clip(action, 0., 1.)  # clip values outside [0, 1] back to the boundary
        if decay_epsilon:
            self.epsilon -= self.depsilon

        return action
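
self.random_process in Example No. 6 is not defined on this page; an Ornstein-Uhlenbeck process is a common choice for this style of DDPG exploration noise. A self-contained sketch (class name and parameter values are assumptions):

import numpy as np


class OrnsteinUhlenbeckProcess:
    def __init__(self, size, theta=0.15, mu=0.0, sigma=0.2, dt=1e-2):
        self.size, self.theta, self.mu, self.sigma, self.dt = size, theta, mu, sigma, dt
        self.x_prev = np.zeros(size)

    def sample(self):
        # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, I)
        dx = (self.theta * (self.mu - self.x_prev) * self.dt
              + self.sigma * np.sqrt(self.dt) * np.random.randn(self.size))
        self.x_prev = self.x_prev + dx
        return self.x_prev
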
Example No. 7
    def action_and_envStep(self, agent, env, t, state, algorithm):
        if algorithm in ('A2C', 'random'):
            state_ts = to_tensor(state).reshape(-1)
            dist = agent.actor(state_ts)
            action = dist.sample()
            # schedule = env.action_to_schedule_v2(action.cpu().numpy(), self.conf['action_dim'])
            next_state, reward, done, info = env.step(
                action.cpu().numpy().item(), t)
        elif algorithm == 'sequence':
            action = agent.select_seqAction()
            # schedule = env.action_to_schedule_v2(action.cpu().numpy(), self.conf['action_dim'])
            next_state, reward, done, info = env.step(
                action.cpu().numpy().item(), t)
            if done: print("done true")

        return action, next_state, reward, done, info
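
agent.select_seqAction() for the 'sequence' baseline is not shown. A hypothetical round-robin version, assuming a self.next_plant counter and self.conf['action_dim'] schedulable plants; it returns a tensor so that action.cpu().numpy().item() above still works:

    def select_seqAction(self):
        # Assumes: import torch
        # Schedule plants in a fixed round-robin order (hypothetical counter attribute).
        action = self.next_plant
        self.next_plant = (self.next_plant + 1) % self.conf['action_dim']
        return torch.tensor(action)
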
Example No. 8
    def update_policy(self, memory, gamma=0.99):
        print("updating...")
        # Sample batch
        experiences = memory.sample(
            self.conf['batch_size']
        )  # type: list | shape: (max_epi_length(2000)-1 x batch(32) x 5 transition fields?)
        if len(experiences) == 0:  # not enough samples
            return
        dtype = torch.cuda.FloatTensor

        policy_loss_total = 0
        value_loss_total = 0

        for t in range(len(experiences) - 1):  # iterate over episodes
            # print("t:", t)
            target_cx = Variable(torch.zeros(self.conf['batch_size'],
                                             50)).type(dtype)
            target_hx = Variable(torch.zeros(self.conf['batch_size'],
                                             50)).type(dtype)

            cx = Variable(torch.zeros(self.conf['batch_size'], 50)).type(dtype)
            hx = Variable(torch.zeros(self.conf['batch_size'], 50)).type(dtype)

            # we first get the data out of the sampled experience
            # shape of state0, action, reward: [batch X state_dim], [batch X 1], [batch X 1]
            state0 = np.stack([
                trajectory.state0 for trajectory in experiences[t]
            ])  # state at time t from each sampled episode in the batch
            # action = np.expand_dims(np.stack((trajectory.action for trajectory in experiences[t])), axis=1)
            action = np.stack(
                [trajectory.action for trajectory in experiences[t]])
            reward = np.expand_dims(np.stack(
                [trajectory.reward for trajectory in experiences[t]]),
                                    axis=1)
            # reward = np.stack((trajectory.reward for trajectory in experiences[t]))
            state1 = np.stack(
                [trajectory.state0 for trajectory in experiences[t + 1]])

            target_action, (target_hx, target_cx) = self.actor_target(
                to_tensor(state1).reshape(self.conf['batch_size'], -1),
                (target_hx, target_cx))
            next_q_value = self.critic_target([
                to_tensor(state1).reshape(self.conf['batch_size'], -1),
                target_action
            ])

            target_q = to_tensor(reward) + gamma * next_q_value

            # Critic update
            current_q = self.critic([
                to_tensor(state0).reshape(self.conf['batch_size'], -1),
                to_tensor(action)
            ])

            value_loss = F.smooth_l1_loss(current_q, target_q)
            value_loss /= len(experiences)  # divide by trajectory length
            value_loss_total += value_loss
            # update per trajectory
            self.critic.zero_grad()
            value_loss.backward()

            # Actor update
            action, (hx, cx) = self.actor(
                to_tensor(state0).reshape(self.conf['batch_size'], -1),
                (hx, cx))
            policy_loss = -self.critic([
                to_tensor(state0).reshape(self.conf['batch_size'], -1), action
            ])
            policy_loss /= len(experiences)  # divide by trajectory length
            policy_loss_total += policy_loss.mean()
            policy_loss = policy_loss.mean()
            self.actor.zero_grad()
            policy_loss.backward()

            self.critic_optim.step()
            self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)
        print("update finish!")
Example No. 9
    def train(self, model_path, log_path):
        self.epi_rewards_mean, self.epi_durations, self.episode = [], [], []
        self.actor_losses, self.critic_losses = [], []
        self.step_count = []
        for epi in range(self.conf['num_episode']):  # episodes
            log_probs, values, rewards, masks = [], [], [], []
            entropy, epi_reward, epi_duration, step = 0, 0.0, 0.0, 0
            print("--- episode %s ---" % (epi))
            self.episode.append(epi)

            state = self.env.reset()  # [2*num_plant, 1]
            t = P.t_start
            while t < P.t_end:  # one episode (simulation)
                epi_duration = t
                if round(t, 3) * 1000 % 10 == 0:  # every 10 ms, schedule update
                    # action = self.agent.select_action(state_ts)     # action type: tensor [1X1]
                    state_ts = to_tensor(state).reshape(-1)  # [2*num_plant]
                    dist = self.agent.actor(state_ts)    # pi(a|s)
                    value = self.agent.critic(state_ts)  # V(s)
                    action = dist.sample()  # scalar tensor
                    next_state, reward, done, info = self.env.step(
                        action.cpu().numpy().item(), t)  # next_state: [(2*num_plant) x 1]

                    log_prob = dist.log_prob(action).unsqueeze(0)  # [1] : log pi(a_t|s_t)
                    entropy += dist.entropy().mean()

                    log_probs.append(log_prob)
                    values.append(value)
                    rewards.append(
                        torch.tensor([reward],
                                     dtype=torch.float,
                                     device=self.device))
                    masks.append(
                        torch.tensor([1 - done],
                                     dtype=torch.float,
                                     device=self.device))
                    # print("step reward: ", reward)
                    state = next_state
                    epi_reward += reward
                    step += 1
                    self.step_count.append(step)
                    if done:
                        break
                else:  # every 1 ms
                    self.env.update_plant_state(t)  # plant status update
                t = t + P.Ts
            # optimize - monte-carlo
            actor_loss, critic_loss = self.agent.optimization_model(
                next_state, rewards, log_probs, values, masks)
            print("epi_duration:", epi_duration)
            print("mean epi_reward:", epi_reward / len(rewards))
            # episode done
            self.epi_rewards_mean.append(epi_reward / len(rewards))
            self.epi_durations.append(epi_duration)
            self.actor_losses.append(actor_loss.item())
            self.critic_losses.append(critic_loss.item())
            if epi % 10 == 0:
                torch.save(self.agent.actor.state_dict(), model_path)
                self.save_log(log_path)
        # Save state_dict
        torch.save(self.agent.actor.state_dict(), model_path)
        self.save_log(log_path)
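
save_log, called by Examples No. 1, 2, and 9, is not shown. A hypothetical sketch that pickles the collected training curves; the key names mirror the attributes filled in above, but the real format is unknown.

    def save_log(self, log_path):
        import pickle  # local import only for this sketch
        log = {
            'episode': self.episode,
            'epi_durations': self.epi_durations,
            'epi_rewards_mean': self.epi_rewards_mean,
        }
        with open(log_path, 'wb') as f:
            pickle.dump(log, f)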