Example #1
        activations = np.array(data.get('activations_' + str(skill)))
        actions = (np.array(data.get('actions_' + str(skill))) - 1)
        termination = np.array(data.get('termination_' + str(skill)))

        print('Creating model...')
        qNetwork = QNetwork(100, 64, 5, -1)
        # Adam optimizer with a step-wise learning-rate decay schedule
        optimizer = torch.optim.Adam(params=qNetwork.parameters(), lr=0.0005)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                    step_size=250,
                                                    gamma=0.9999)
        maxQ = 1
        iterations = 0
        for _ in range(20000000):
            if (_ % 1000000 == 0 and _ > 0):
                testPredictions = qNetwork.predict(
                    activations[int(math.ceil(activations.shape[0] * 0.8)) +
                                1:activations.shape[0], :])
                trainPredictions = qNetwork.predict(
                    activations[0:int(math.ceil(activations.shape[0] *
                                                0.8)), :])
                print('Done ' + str(_) + ' iterations. testing error is:...')
            print('Loss: ' + str(loss_val) + ', Skill#: ' + str(skill))
            # Sample a random minibatch of indices from the first 80% of the data
            index = np.random.randint(int(math.ceil(activations.shape[0] * 0.8)),
                                      size=batchSize)

            allQ = qNetwork.predict(activations[index, :])
            Q1 = qNetwork.predict(activations[index + 1, :])

            targetQ = np.ones(allQ.shape) * -1

            for i in range(index.shape[0]):
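The snippet above is cut off inside the per-sample loop. As a rough guide only, a minimal, self-contained sketch of how such Bellman targets are typically filled in follows; the rewards array and gamma below are hypothetical (neither appears in the fragment above), and the exact training call of this QNetwork class is not shown.

import numpy as np

# Dummy stand-ins with the same roles as the arrays above (shapes for illustration only).
rng = np.random.default_rng(0)
batch_size, n_actions, gamma = 32, 5, 0.99
Q1 = rng.normal(size=(batch_size, n_actions))           # Q(s_{t+1}, .) for the next states
actions = rng.integers(0, n_actions, size=batch_size)   # actions taken at s_t
termination = rng.integers(0, 2, size=batch_size)       # 1 if the episode ended after s_t
rewards = -np.ones(batch_size)                           # hypothetical per-step reward of -1

# Standard Q-learning target: only the taken action's entry changes, and the
# bootstrap term is dropped on terminal transitions.
targetQ = np.ones((batch_size, n_actions)) * -1
for i in range(batch_size):
    if termination[i]:
        targetQ[i, actions[i]] = rewards[i]
    else:
        targetQ[i, actions[i]] = rewards[i] + gamma * np.max(Q1[i])
# The network would then be fit on (activations[index, :], targetQ) with whatever
# training method this QNetwork exposes.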
Example #2
import numpy as np
from random import random, randint

# QNetwork and ReplayMemory come from the project's own modules (not shown).


class DeepQ_agent:
    """
    Represents the DQN agent.
    """
    def __init__(self, env, hidden_units=None, network_LR=0.01, batch_size=1024, update_every=5, gamma=0.95):
        """
        Creates a DQN agent.

        :param env: game environment.
        :type env: Class Snake_Env().
        :param hidden_units: number of neurons in each layer.
        :type hidden_units: tuple with dimension (1, 3).
        :param network_LR: learning rate of the action-value neural network.
        :type network_LR: float.
        :param batch_size: size of the minibatch taken from the replay buffer.
        :type batch_size: int.
        :param update_every: number of learning steps between target network updates.
        :type update_every: int
        :param gamma: discount factor.
        :type gamma: float.
        """
        self.env = env
        self.BATCH_SIZE = batch_size
        self.GAMMA = gamma          
        self.NETWORK_LR = network_LR
        self.MEMORY_CAPACITY = int(1e5)   
        self.ACTION_SIZE = env.ACTION_SPACE           
        self.HIDDEN_UNITS = hidden_units
        self.UPDATE_EVERY = update_every
       
        self.qnetwork_local = QNetwork(input_shape = self.env.STATE_SPACE,
                                        hidden_units = self.HIDDEN_UNITS,
                                        output_size = self.ACTION_SIZE,
                                        learning_rate = self.NETWORK_LR)
        
        self.qnetwork_target = QNetwork(input_shape = self.env.STATE_SPACE,
                                        hidden_units = self.HIDDEN_UNITS,
                                        output_size = self.ACTION_SIZE,
                                        learning_rate = self.NETWORK_LR)

        self.memory = ReplayMemory(self.MEMORY_CAPACITY, self.BATCH_SIZE) 

        #Temp variable
        self.t = 0


    def learn(self):
        """
        Learn from memorized experience.
        """
        if len(self.memory) > self.BATCH_SIZE:
            states, actions, rewards, next_states, dones = self.memory.sample(self.env.STATE_SPACE)
            
            #Calculating action-values using local network
            target = self.qnetwork_local.predict(states, self.BATCH_SIZE)
            
            #Future action-values using target network
            target_val = self.qnetwork_target.predict(next_states, self.BATCH_SIZE)
            
            #Future action-values using local network
            target_next = self.qnetwork_local.predict(next_states, self.BATCH_SIZE)
        
            max_action_values = np.argmax(target_next, axis=1)   #action selection
            
            for i in range(self.BATCH_SIZE):
                if dones[i]:
                    target[i][actions[i]] = rewards[i]
                else:
                    target[i][actions[i]] = rewards[i] + self.GAMMA*target_val[i][max_action_values[i]]   #action evaluation
            
            self.qnetwork_local.train(states, target, batch_size = self.BATCH_SIZE)

            if self.t == self.UPDATE_EVERY:
                self.update_target_weights()
                self.t = 0
            else:
                self.t += 1


    def act(self, state, epsilon=0.0):
        """
        Chooses an action using an epsilon-greedy policy.
        
        :param state: current state.
        :type state: NumPy array with dimension (1, 18).
        :param epsilon: epsilon used in epsilon-greedy policy.
        :type epsilon: float
        :return action: action chosen by the agent.
        :rtype: int
        """    
        state = state.reshape((1,)+state.shape)
        action_values = self.qnetwork_local.predict(state)    #returns a vector of size = self.ACTION_SIZE
        if random() > epsilon:
            action = np.argmax(action_values)                 #choose best action - Exploitation
        else:
            action = randint(0, self.ACTION_SIZE-1)           #choose random action - Exploration
        return action


    def add_experience(self, state, action, reward, next_state, done):
        """
        Add experience to agent's memory.
        """
        self.memory.add(state, action, reward, next_state, done)

    
    def update_target_weights(self):
        """
        Updates values of the Target network.
        """
        self.qnetwork_target.model.set_weights(self.qnetwork_local.model.get_weights())
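As a usage note, a minimal training-loop sketch for this agent follows; the Snake_Env interface (reset()/step() returning state, reward and done) and the epsilon schedule are assumptions for illustration, not part of the class above.

# Hypothetical driver loop for DeepQ_agent; environment details are assumed.
env = Snake_Env()                                    # the project's environment (assumed)
agent = DeepQ_agent(env, hidden_units=(64, 64, 64))
epsilon = 1.0
for episode in range(500):
    state = env.reset()
    done = False
    while not done:
        action = agent.act(state, epsilon)
        next_state, reward, done = env.step(action)  # assumed return signature
        agent.add_experience(state, action, reward, next_state, done)
        agent.learn()
        state = next_state
    epsilon = max(0.01, epsilon * 0.995)             # simple decaying exploration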


import random
import time

import numpy as np


class DeepQ_agent:
    def __init__(self,
                 env,
                 hidden_units=None,
                 network_LR=0.001,
                 batch_size=64,
                 update_every=4,
                 gamma=1.0):
        self.env = env
        self.BATCH_SIZE = batch_size
        self.GAMMA = gamma
        self.NETWORK_LR = network_LR
        self.MEMORY_CAPACITY = int(1e5)  #replay buffer capacity

        self.nA = env.ACTION_SPACE  #number of actions agent can perform
        self.HIDDEN_UNITS = hidden_units
        self.UPDATE_EVERY = update_every

        #let's give it some brains: the local (online) Q-network
        self.qnetwork_local = QNetwork(input_shape=self.env.STATE_SPACE,
                                       hidden_units=self.HIDDEN_UNITS,
                                       output_size=self.nA,
                                       learning_rate=self.NETWORK_LR)
        self.qnetwork_local.model.summary()  #summary() already prints the architecture

        #I think of the target network as the PC:
        # where the agent stores the stable, trusted estimates used for the TD targets
        self.qnetwork_target = QNetwork(input_shape=self.env.STATE_SPACE,
                                        hidden_units=self.HIDDEN_UNITS,
                                        output_size=self.nA,
                                        learning_rate=self.NETWORK_LR)

        #and the memory of course
        self.memory = ReplayMemory(self.MEMORY_CAPACITY, self.BATCH_SIZE)

        #handy temp variable
        self.t = 0

#----------------------Learn from experience-----------------------------------#

    def learn(self):
        '''
            Learn from a minibatch of memorized experience (Double DQN update).
        '''

        if self.memory.__len__() > self.BATCH_SIZE:
            states, actions, rewards, next_states, dones = self.memory.sample(
                self.env.STATE_SPACE)

            #calculating action-values using local network
            target = self.qnetwork_local.predict(states, self.BATCH_SIZE)

            #future action-values using target network
            target_val = self.qnetwork_target.predict(next_states,
                                                      self.BATCH_SIZE)

            #future action-values using local network
            target_next = self.qnetwork_local.predict(next_states,
                                                      self.BATCH_SIZE)

            #The main point of Double DQN: the next action is selected with the local network
            #while its value (the update target) comes from the target network
            max_action_values = np.argmax(target_next,
                                          axis=1)  #action selection

            for i in range(self.BATCH_SIZE):
                if dones[i]:
                    target[i][actions[i]] = rewards[i]
                else:
                    target[i][
                        actions[i]] = rewards[i] + self.GAMMA * target_val[i][
                            max_action_values[i]]  #action evaluation

            self.qnetwork_local.train(states,
                                      target,
                                      batch_size=self.BATCH_SIZE)

            if self.t == self.UPDATE_EVERY:
                self.update_target_weights()
                self.t = 0
            else:
                self.t += 1

#-----------------------Time to act-----------------------------------------------#

    def act(self, state, epsilon=0):  #set to NO exploration by default
        state = state.reshape((1, ) + state.shape)
        action_values = self.qnetwork_local.predict(
            state)  #returns a vector of size = self.nA
        if random.random() > epsilon:
            action = np.argmax(
                action_values)  #choose best action - Exploitation
        else:
            action = random.randint(0, self.nA -
                                    1)  #choose random action - Exploration

        return action

#-----------------------------Add experience to agent's memory------------------------#

    def add_experience(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)

#----------------------Updates values of Target network----------------------------#

    def update_target_weights(self):
        #hard update (a full copy of the local weights); a soft (Polyak) update also works, see the sketch below
        self.qnetwork_target.model.set_weights(
            self.qnetwork_local.model.get_weights())

#---------------------helpful save function-------------------------------------#

    def save(self, model_num, directory):
        self.qnetwork_local.model.save(
            f'{directory}/snake_dqn_{model_num}_{time.asctime()}.h5')
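The hard update above copies the local weights wholesale every UPDATE_EVERY learning steps. A soft (Polyak) update, mentioned in the comment in update_target_weights, blends the two weight sets instead; a minimal sketch follows, where tau is a hypothetical blending factor and the Keras-style get_weights()/set_weights() calls mirror the ones used above.

def soft_update_target_weights(agent, tau=1e-3):
    """Polyak update: w_target <- tau * w_local + (1 - tau) * w_target."""
    local_w = agent.qnetwork_local.model.get_weights()
    target_w = agent.qnetwork_target.model.get_weights()
    agent.qnetwork_target.model.set_weights(
        [tau * lw + (1.0 - tau) * tw for lw, tw in zip(local_w, target_w)])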
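For completeness, the per-sample loop in learn() can also be written in vectorized form, which makes the Double DQN split explicit: actions are selected with the local network and evaluated with the target network. The array names below mirror the ones used in learn(); the random data is only a stand-in.

import numpy as np

rng = np.random.default_rng(0)
batch_size, n_actions, gamma = 64, 3, 0.95
rewards = rng.normal(size=batch_size)
dones = rng.integers(0, 2, size=batch_size).astype(bool)
actions = rng.integers(0, n_actions, size=batch_size)
target = rng.normal(size=(batch_size, n_actions))        # local net on states
target_next = rng.normal(size=(batch_size, n_actions))   # local net on next states
target_val = rng.normal(size=(batch_size, n_actions))    # target net on next states

# a* = argmax_a Q_local(s', a)                      (action selection)
# y  = r + gamma * Q_target(s', a*) * (1 - done)    (action evaluation)
best_actions = np.argmax(target_next, axis=1)
rows = np.arange(batch_size)
target[rows, actions] = rewards + gamma * target_val[rows, best_actions] * (~dones)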