Example #1
    # Assumes module-level imports: numpy as np, random, tqdm (from tqdm import tqdm),
    # plus a get_index(value, space) helper that maps a state/action to its table index.
    def get_q_table(self, episodes=100):

        rewards_list = np.zeros(episodes)
        print("SARSA")

        for i in tqdm(range(1, episodes)):
            state = self.env.reset  # initial state (reset appears to be an attribute of this custom env)
            cum_rewards = 0
            epochs, penalties, reward = 0, 0, 0
            done = False  # terminal-state flag

            while not done:

                state_index = get_index(state, self.env.observation_space)

                if random.uniform(0, 1) < self.epsilon:
                    action = np.random.choice(self.env.action_space)  # Explore action space
                    # np.where returns a tuple of arrays; take the scalar index
                    action_index = int(np.where(self.env.action_space == action)[0][0])
                else:
                    action_index = np.argmax(self.q_table[state_index])  # Exploit learned values
                    action = self.env.action_space[action_index]

                next_state, reward, done = self.env.step(state, action)

                next_state_index = get_index(next_state, self.env.observation_space)

                # Note: the next action is chosen greedily (argmax), so this target matches
                # Q-learning rather than the on-policy SARSA update the print statement suggests.
                next_action_index = np.argmax(self.q_table[next_state_index])

                # TD error: r + gamma * Q(s', a') - Q(s, a)
                delta = (reward
                         + self.gamma * self.q_table[next_state_index, next_action_index]
                         - self.q_table[state_index, action_index])

                self.q_table[state_index, action_index] += self.alpha * delta
                cum_rewards += reward
               
                
                state = next_state  # move to the next state
                epochs += 1
                

            rewards_list[i] = cum_rewards


        return self.q_table, rewards_list #, self.eligibility_trace
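
All of these snippets assume module-level imports of numpy (as np), random and tqdm, plus a get_index(value, space) helper that maps a state or action to its row/column in the Q-table; that helper is not shown anywhere in the examples. A minimal sketch of what it might look like, assuming the observation and action spaces are plain sequences of all discrete values (an assumption, not the original implementation):

import numpy as np

def get_index(value, space):
    # Return the position of `value` in a discrete space (list or 1-D array).
    # np.array_equal handles both scalar and tuple-valued states.
    for idx, item in enumerate(space):
        if np.array_equal(item, value):
            return idx
    raise ValueError(f"{value!r} not found in the given space")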
Example #2
    def evaluate_agent(self, episodes=1):

        total_epochs, total_penalties = 0, 0
        rewards_list = []  # rewards collected across all evaluation episodes

        for _ in range(episodes):
            state = self.env.reset
            epochs, penalties, reward = 0, 0, 0
            done = False
    
            while not done:
                state_index = get_index(state, self.env.observation_space)

                # Greedy policy: always take the action with the highest Q-value
                action_index = np.argmax(self.q_table[state_index])
                action = self.env.action_space[action_index]
                state, reward, done = self.env.step(state, action)
                rewards_list.append(reward)

                if reward == 0:
                    penalties += 1

                epochs += 1
                
            total_penalties += penalties
            total_epochs += epochs

        return rewards_list
Example #3
    def evaluate_agent(self, test_episodes=100):

        total_epochs, total_penalties = 0, 0
        save_policies = np.zeros((test_episodes, self.env.timesteps - 1))

        for ep in range(test_episodes):
            state = self.env.reset
            epochs, penalties, reward = 0, 0, 0

            reward_list = []
            policy = []
            stock_list = []
            done = False

            while not done:
                state_index = get_index(state, self.env.observation_space)
                action_index = np.argmax(self.q_table[state_index])
                action = self.env.action_space[action_index]
                state, reward, done = self.env.step(state, action)
                reward_list.append(reward)
                policy.append(action)
                # Non-integer states carry a stock value in position 1
                if not isinstance(state, int):
                    stock_list.append(state[1])

                if reward == 0:
                    penalties += 1

                epochs += 1

            total_penalties += penalties
            total_epochs += epochs
            save_policies[ep] = policy

        # The lists below come from the last evaluated episode;
        # save_policies keeps the action sequence of every episode.
        return reward_list, stock_list, policy
Example #4
    def get_q_table(self, episodes=100001):

        rewards_list = np.zeros(episodes)
        print('Q-learning')
        for i in tqdm(range(1, episodes)):
            state = self.env.reset
            epochs, penalties, reward = 0, 0, 0
            cum_rewards = 0
            done = False
            while not done:
                state_index = get_index(state, self.env.observation_space)
                if random.uniform(0, 1) < self.epsilon:
                    action = np.random.choice(
                        self.env.action_space)  # Explore action space
                    # np.where returns a tuple of arrays; take the scalar index
                    action_index = int(
                        np.where(self.env.action_space == action)[0][0])
                else:
                    action_index = np.argmax(
                        self.q_table[state_index])  # Exploit learned values
                    action = self.env.action_space[action_index]

                next_state, reward, done = self.env.step(state, action)
                old_value = self.q_table[state_index, action_index]
                next_state_index = get_index(next_state,
                                             self.env.observation_space)
                next_max = np.max(self.q_table[next_state_index])
                # Q-learning update: Q(s,a) <- (1-alpha)*Q(s,a) + alpha*(r + gamma*max_a' Q(s',a'))
                new_value = (1 - self.alpha) * old_value + self.alpha * (
                    reward + self.gamma * next_max)
                self.q_table[state_index, action_index] = new_value
                cum_rewards += reward

                if reward == 0:
                    penalties += 1

                state = next_state
                epochs += 1
            rewards_list[i] = cum_rewards

        return self.q_table, rewards_list
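
The assignment near the end of Example #4 is the standard tabular Q-learning rule, Q(s,a) <- (1 - alpha) * Q(s,a) + alpha * (r + gamma * max_a' Q(s',a')). The short self-contained snippet below isolates that single update with made-up numbers to make the arithmetic concrete; it is an illustration, not part of the original project:

import numpy as np

alpha, gamma = 0.1, 0.9
q_table = np.zeros((3, 2))          # toy table: 3 states, 2 actions

state_index, action_index = 0, 1
reward, next_state_index = 5.0, 2

old_value = q_table[state_index, action_index]
next_max = np.max(q_table[next_state_index])
q_table[state_index, action_index] = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)

print(q_table[state_index, action_index])   # 0.5 = 0.9*0 + 0.1*(5 + 0.9*0)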
Example #5
    def get_q_table(self, episodes=100001):

        r_list = np.zeros(episodes)
        print('Monte Carlo')

        for i in tqdm(range(1, episodes)):
            state = self.env.reset
            state_list = []
            action_list = []
            reward_list = []

            cum_rewards = 0
            epochs, penalties, reward = 0, 0, 0
            done = False

            while not done:

                state_index = get_index(state, self.env.observation_space)

                if random.uniform(0, 1) < self.epsilon:
                    action = np.random.choice(
                        self.env.action_space)  # Explore action space
                    # np.where returns a tuple of arrays; take the scalar index
                    action_index = int(
                        np.where(self.env.action_space == action)[0][0])
                else:
                    action_index = np.argmax(
                        self.q_table[state_index])  # Exploit learned values
                    action = self.env.action_space[action_index]

                state_list.append(state)
                action_list.append(action)

                next_state, reward, done = self.env.step(state, action)

                reward_list.append(reward)

                if reward == 0:
                    penalties += 1
                cum_rewards += reward

                state = next_state
                epochs += 1

            for k in range(len(state_list)):
                state = state_list[k]
                action = action_list[k]
                action_index = int(
                    np.where(self.env.action_space == action)[0][0])
                state_index = get_index(state, self.env.observation_space)
                # Discounted return from step k to the end of the episode
                future_rewards = reward_list[k:]
                discount = [self.gamma**j for j in range(len(future_rewards))]
                discounted_reward = np.dot(future_rewards, discount)

                # Constant-alpha Monte Carlo update toward the sampled return
                self.q_table[state_index, action_index] = (
                    (1 - self.alpha) * self.q_table[state_index, action_index]
                    + self.alpha * discounted_reward)


            r_list[i] = cum_rewards

        #print("Training finished.\n")

        return self.q_table, r_list
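
The per-step loop at the end of Example #5 rebuilds a discount vector and a dot product for every visited step. The small self-contained snippet below reproduces that computation on made-up rewards and shows an equivalent single backward pass (G_k = r_k + gamma * G_{k+1}); both are illustrations, not part of the original code:

import numpy as np

gamma = 0.9
reward_list = [1.0, 0.0, 2.0]

# As in the example: G_k = sum_j gamma**j * reward_list[k + j]
returns_dot = []
for k in range(len(reward_list)):
    future = reward_list[k:]
    discount = [gamma**j for j in range(len(future))]
    returns_dot.append(float(np.dot(future, discount)))

# Equivalent single backward pass
returns_rev = []
g = 0.0
for r in reversed(reward_list):
    g = r + gamma * g
    returns_rev.insert(0, g)

print(returns_dot)   # [2.62, 1.8, 2.0] up to floating-point rounding
print(returns_rev)   # same values from the backward recursion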