activations = np.array(data.get('activations_' + str(skill))) actions = (np.array(data.get('actions_' + str(skill))) - 1) termination = np.array(data.get('termination_' + str(skill))) print('Creating model...') qNetwork = QNetwork(100, 64, 5, -1) optimizer = torch.optim.lr_scheduler.StepLR(optimizer=torch.optim.Adam( params=qNetwork.parameters(), lr=0.0005), step_size=250, gamma=0.9999) maxQ = 1 iterations = 0 for _ in range(20000000): if (_ % 1000000 == 0 and _ > 0): testPredictions = qNetwork.predict( activations[int(math.ceil(activations.shape[0] * 0.8)) + 1:activations.shape[0], :]) trainPredictions = qNetwork.predict( activations[0:int(math.ceil(activations.shape[0] * 0.8)), :]) print('Done ' + _ + ' iterations. testing error is:...') print('Loss: ' + loss_val + ', Skill#: ' + skill) index = np.random.randint(int(math.ceil(activations.shape[0] * 0.8)), size=batchSize) allQ = qNetwork.predict(activations[index, :]) Q1 = qNetwork.predict(activations[index + 1, :]) targetQ = np.ones(allQ.shape) * -1 for i in range(index.shape[0]):
class DeepQ_agent:
    """
    Represents the DQN agent.

    The agent owns two Q-networks (a "local" one that is trained and a
    "target" one that is periodically overwritten with the local weights)
    and a replay memory. Learning follows the Double-DQN rule: the local
    network selects the next action, the target network evaluates it.
    """

    def __init__(self, env, hidden_units=None, network_LR=0.01, batch_size=1024, update_every=5, gamma=0.95):
        """
        Creates a DQN agent.

        :param env: game environment; must expose STATE_SPACE and ACTION_SPACE.
        :type env: Class Snake_Env().
        :param hidden_units: number of neurons in each layer (passed through
            to QNetwork unchanged).
        :type hidden_units: tuple with dimension (1, 3).
        :param network_LR: learning rate of the action-value neural network.
        :type network_LR: float.
        :param batch_size: size of the minibatch taken from the replay buffer.
        :type batch_size: int.
        :param update_every: number of training steps between two
            synchronisations of the target qnetwork.
        :type update_every: int
        :param gamma: discount factor.
        :type gamma: float.
        """
        self.env = env
        self.BATCH_SIZE = batch_size
        self.GAMMA = gamma
        self.NETWORK_LR = network_LR
        self.MEMORY_CAPACITY = int(1e5)
        self.ACTION_SIZE = env.ACTION_SPACE
        self.HIDDEN_UNITS = hidden_units
        self.UPDATE_EVERY = update_every

        # Both networks are built with identical hyper-parameters so that
        # weights can be copied from one to the other.
        self.qnetwork_local = QNetwork(input_shape=self.env.STATE_SPACE,
                                       hidden_units=self.HIDDEN_UNITS,
                                       output_size=self.ACTION_SIZE,
                                       learning_rate=self.NETWORK_LR)
        self.qnetwork_target = QNetwork(input_shape=self.env.STATE_SPACE,
                                        hidden_units=self.HIDDEN_UNITS,
                                        output_size=self.ACTION_SIZE,
                                        learning_rate=self.NETWORK_LR)
        self.memory = ReplayMemory(self.MEMORY_CAPACITY, self.BATCH_SIZE)

        # Number of training steps performed since the last target sync.
        self.t = 0

    def learn(self):
        """
        Learn from memorized experience.

        Does nothing until the replay memory holds more than BATCH_SIZE
        experiences. Otherwise a minibatch is sampled, Double-DQN targets are
        written into the local network's own predictions and the local
        network is trained on them. After every UPDATE_EVERY-th training
        step the target network is overwritten with the local weights.
        """
        if len(self.memory) <= self.BATCH_SIZE:
            return

        states, actions, rewards, next_states, dones = self.memory.sample(self.env.STATE_SPACE)

        # Current action-values from the local network; only the entry of the
        # action actually taken is replaced below, so the remaining entries
        # contribute no error during training.
        target = self.qnetwork_local.predict(states, self.BATCH_SIZE)
        # Future action-values using the target network (action evaluation).
        target_val = self.qnetwork_target.predict(next_states, self.BATCH_SIZE)
        # Future action-values using the local network (action selection).
        target_next = self.qnetwork_local.predict(next_states, self.BATCH_SIZE)
        best_next_actions = np.argmax(target_next, axis=1)

        for row, (action, reward, done) in enumerate(zip(actions, rewards, dones)):
            if done:
                # Terminal transition: nothing to bootstrap from.
                target[row][action] = reward
            else:
                target[row][action] = reward + self.GAMMA * target_val[row][best_next_actions[row]]

        self.qnetwork_local.train(states, target, batch_size=self.BATCH_SIZE)

        # Count this training step first, then compare: the previous
        # compare-then-increment order synced only every UPDATE_EVERY + 1 steps.
        self.t += 1
        if self.t >= self.UPDATE_EVERY:
            self.update_target_weights()
            self.t = 0

    def act(self, state, epsilon=0.0):
        """
        Chooses an action using an epsilon-greedy policy.

        :param state: current state.
        :type state: NumPy array with dimension (1, 18).
        :param epsilon: epsilon used in epsilon-greedy policy.
        :type epsilon: float
        :return action: action chosen by the agent.
        :rtype: int
        """
        if random() > epsilon:
            # Exploitation: the network is only queried on this branch, so an
            # exploratory step does not pay for a forward pass.
            state = state.reshape((1,) + state.shape)  # add a batch axis
            action_values = self.qnetwork_local.predict(state)
            return np.argmax(action_values)
        # Exploration: pick a random action.
        return randint(0, self.ACTION_SIZE - 1)

    def add_experience(self, state, action, reward, next_state, done):
        """
        Add experience to agent's memory.
        """
        self.memory.add(state, action, reward, next_state, done)

    def update_target_weights(self):
        """
        Updates values of the Target network.

        Hard update: the target weights are replaced by the local weights.
        """
        self.qnetwork_target.model.set_weights(self.qnetwork_local.model.get_weights())
class DeepQ_agent:
    """
    Double-DQN agent.

    Holds a local Q-network (trained on every learning step), a target
    Q-network (periodically overwritten with the local weights) and a replay
    memory from which minibatches are sampled.
    """

    def __init__(self,
                 env,
                 hidden_units=None,
                 network_LR=0.001,
                 batch_size=64,
                 update_every=4,
                 gamma=1.0):
        """
        Create the agent and both of its networks.

        :param env: environment; must expose STATE_SPACE and ACTION_SPACE.
        :param hidden_units: forwarded unchanged to QNetwork.
        :param network_LR: learning rate handed to both networks.
        :param batch_size: minibatch size used for sampling and training.
        :param update_every: number of training steps between two
            synchronisations of the target network.
        :param gamma: discount factor.
        """
        self.env = env
        self.BATCH_SIZE = batch_size
        self.GAMMA = gamma
        self.NETWORK_LR = network_LR
        self.MEMORY_CAPACITY = int(1e5)
        self.nA = env.ACTION_SPACE  # number of actions the agent can perform
        self.HIDDEN_UNITS = hidden_units
        self.UPDATE_EVERY = update_every

        # Local network: the one that is actually trained.
        self.qnetwork_local = QNetwork(input_shape=self.env.STATE_SPACE,
                                       hidden_units=self.HIDDEN_UNITS,
                                       output_size=self.nA,
                                       learning_rate=self.NETWORK_LR)
        # summary() prints the table itself and returns None, so wrapping it
        # in print() emitted a stray "None" line after the table.
        self.qnetwork_local.model.summary()

        # Target network: same architecture, only ever updated by copying the
        # local weights into it.
        self.qnetwork_target = QNetwork(input_shape=self.env.STATE_SPACE,
                                        hidden_units=self.HIDDEN_UNITS,
                                        output_size=self.nA,
                                        learning_rate=self.NETWORK_LR)

        self.memory = ReplayMemory(self.MEMORY_CAPACITY, self.BATCH_SIZE)

        # Number of training steps performed since the last target sync.
        self.t = 0

    def learn(self):
        """
        Run one training step from replayed experience.

        Does nothing until the replay memory holds more than BATCH_SIZE
        experiences. Otherwise a minibatch is sampled, Double-DQN targets are
        written into the local network's own predictions and the local
        network is trained on them. After every UPDATE_EVERY-th training
        step the target network is overwritten with the local weights.
        """
        if len(self.memory) <= self.BATCH_SIZE:
            return

        states, actions, rewards, next_states, dones = self.memory.sample(
            self.env.STATE_SPACE)

        # Current action-values from the local network; only the entry of the
        # action actually taken is replaced below.
        target = self.qnetwork_local.predict(states, self.BATCH_SIZE)
        # Future action-values from the target network.
        target_val = self.qnetwork_target.predict(next_states, self.BATCH_SIZE)
        # Future action-values from the local network.
        target_next = self.qnetwork_local.predict(next_states, self.BATCH_SIZE)

        # The main point of Double DQN: the action is selected by the local
        # network while its value is taken from the target network.
        best_next_actions = np.argmax(target_next, axis=1)

        for row, (action, reward, done) in enumerate(zip(actions, rewards, dones)):
            if done:
                # Terminal transition: nothing to bootstrap from.
                target[row][action] = reward
            else:
                target[row][action] = (
                    reward + self.GAMMA * target_val[row][best_next_actions[row]])

        self.qnetwork_local.train(states, target, batch_size=self.BATCH_SIZE)

        # Count this training step first, then compare: the previous
        # compare-then-increment order synced only every UPDATE_EVERY + 1 steps.
        self.t += 1
        if self.t >= self.UPDATE_EVERY:
            self.update_target_weights()
            self.t = 0

    def act(self, state, epsilon=0):
        """
        Choose an action with an epsilon-greedy policy.

        :param state: current state as a NumPy array (a batch axis is added
            here before it is given to the network).
        :param epsilon: exploration probability; 0 (the default) means no
            exploration.
        :return: index of the chosen action.
        """
        if random.random() > epsilon:
            # Exploitation: the network is only queried on this branch, so an
            # exploratory step does not pay for a forward pass.
            state = state.reshape((1, ) + state.shape)
            action_values = self.qnetwork_local.predict(state)  # one value per action
            return np.argmax(action_values)
        # Exploration: pick a random action.
        return random.randint(0, self.nA - 1)

    def add_experience(self, state, action, reward, next_state, done):
        """Store one transition in the agent's replay memory."""
        self.memory.add(state, action, reward, next_state, done)

    def update_target_weights(self):
        """
        Copy the local network's weights into the target network.

        This is a hard update; a soft (interpolated) update would be an
        alternative.
        """
        self.qnetwork_target.model.set_weights(
            self.qnetwork_local.model.get_weights())

    def save(self, model_num, directory):
        """
        Save the local network's model under ``directory``.

        :param model_num: identifier embedded in the file name.
        :param directory: destination directory.
        """
        # NOTE(review): time.asctime() contains spaces and colons, which are
        # not valid in file names on every platform - confirm before changing,
        # since existing tooling may rely on this naming scheme.
        self.qnetwork_local.model.save(
            f'{directory}/snake_dqn_{model_num}_{time.asctime()}.h5')