    # WolfPHCAgent method stubs (the class header lies above this excerpt);
    # the skeleton leaves their bodies unimplemented.
    def setWinDelta(self, winDelta):
        raise NotImplementedError

    def setLoseDelta(self, loseDelta):
        raise NotImplementedError

    def computeHyperparameters(self, numTakenActions, episodeNumber):
        raise NotImplementedError

if __name__ == '__main__':
    numOpponents = 1
    numAgents = 2
    MARLEnv = DiscreteMARLEnvironment(numOpponents=numOpponents, numAgents=numAgents)

    agents = []
    for i in range(numAgents):  # was args.numAgents, but this script sets up no argparse args
        agent = WolfPHCAgent(learningRate=0.2, discountFactor=0.99)
        agents.append(agent)

    numEpisodes = 4000
    numTakenActions = 0
    for episode in range(numEpisodes):
        status = ["IN_GAME", "IN_GAME", "IN_GAME"]
        observation = MARLEnv.reset()

        while status[0] == "IN_GAME":
            # Refresh every agent's WoLF deltas and learning rate before acting.
            for agent in agents:
                loseDelta, winDelta, learningRate = agent.computeHyperparameters(numTakenActions, episode)
                agent.setLoseDelta(loseDelta)
                agent.setWinDelta(winDelta)
                agent.setLearningRate(learningRate)

            # Each agent acts on a copy of its own observation.
            actions = []
            perAgentObs = []
            agentIdx = 0
            for agent in agents:
                obsCopy = deepcopy(observation[agentIdx])
                perAgentObs.append(obsCopy)
                agent.setState(agent.toStateRepresentation(obsCopy))
                actions.append(agent.act())
                agentIdx += 1

            nextObservation, reward, done, status = MARLEnv.step(actions)
            numTakenActions += 1

            # Learn from each agent's own transition, then run the WoLF-PHC
            # average-policy and hill-climbing updates.
            agentIdx = 0
            for agent in agents:
                agent.setExperience(agent.toStateRepresentation(perAgentObs[agentIdx]),
                                    actions[agentIdx], reward[agentIdx], status[agentIdx],
                                    agent.toStateRepresentation(nextObservation[agentIdx]))
                agent.learn()
                agent.calculateAveragePolicyUpdate()
                agent.calculatePolicyUpdate()
                agentIdx += 1

            observation = nextObservation
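# A minimal sketch of the three WolfPHCAgent stubs above, assuming the WoLF
# deltas are plain attributes and the learning rate is annealed with
# experience. The class name, the initLearningRate attribute, and the decay
# constant are illustrative assumptions, not part of the skeleton.
class WolfPHCAgentSketch:
    def __init__(self, learningRate=0.2, winDelta=0.01, loseDelta=0.1):
        self.initLearningRate = learningRate
        self.winDelta = winDelta
        self.loseDelta = loseDelta

    def setWinDelta(self, winDelta):
        self.winDelta = winDelta

    def setLoseDelta(self, loseDelta):
        self.loseDelta = loseDelta

    def computeHyperparameters(self, numTakenActions, episodeNumber):
        # Keep winDelta < loseDelta ("win or learn fast") and decay the
        # learning rate as taken actions accumulate.
        learningRate = self.initLearningRate / (1.0 + 1e-4 * numTakenActions)
        return self.loseDelta, self.winDelta, learningRate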
#
# def setLearningRate(self, learningRate):
#     raise NotImplementedError
#
# def computeHyperparameters(self, numTakenActions, episodeNumber):
#     raise NotImplementedError

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--numOpponents', type=int, default=1)
    parser.add_argument('--numAgents', type=int, default=2)
    parser.add_argument('--numEpisodes', type=int, default=100)
    args = parser.parse_args()

    MARLEnv = DiscreteMARLEnvironment(numOpponents=args.numOpponents, numAgents=args.numAgents)

    agents = []
    for i in range(args.numAgents):
        agent = IndependentQLearningAgent(learningRate=0.1, discountFactor=0.9, epsilon=1.0)
        agents.append(agent)

    numEpisodes = args.numEpisodes
    numTakenActions = 0
    for episode in range(numEpisodes):
        status = ["IN_GAME", "IN_GAME", "IN_GAME"]
        observation = MARLEnv.reset()
        totalReward = 0.0
        timeSteps = 0
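# A minimal sketch of how the episode loop above typically continues,
# mirroring the WoLF-PHC driver: each agent picks an action from its own
# observation copy, the joint action is stepped through the environment, and
# each agent learns from its own (s, a, r, s') transition. The setEpsilon
# setter and the (learningRate, epsilon) return shape of
# computeHyperparameters are assumptions consistent with the epsilon=1.0
# constructor argument.
from copy import deepcopy

def runEpisode(MARLEnv, agents, numTakenActions, episode):
    status = ["IN_GAME", "IN_GAME", "IN_GAME"]
    observation = MARLEnv.reset()
    totalReward, timeSteps = 0.0, 0
    while status[0] == "IN_GAME":
        for agent in agents:
            learningRate, epsilon = agent.computeHyperparameters(numTakenActions, episode)
            agent.setLearningRate(learningRate)
            agent.setEpsilon(epsilon)  # assumed setter, mirroring setLearningRate
        actions, stateCopies = [], []
        for agentIdx, agent in enumerate(agents):
            obsCopy = deepcopy(observation[agentIdx])
            stateCopies.append(obsCopy)
            agent.setState(agent.toStateRepresentation(obsCopy))
            actions.append(agent.act())
        nextObservation, reward, done, status = MARLEnv.step(actions)
        numTakenActions += 1
        totalReward += sum(reward)
        timeSteps += 1
        for agentIdx, agent in enumerate(agents):
            agent.setExperience(agent.toStateRepresentation(stateCopies[agentIdx]),
                                actions[agentIdx], reward[agentIdx], status[agentIdx],
                                agent.toStateRepresentation(nextObservation[agentIdx]))
            agent.learn()
        observation = nextObservation
    return totalReward, timeSteps, numTakenActions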
    parser.add_argument('--numEpisodes', type=int, default=50000)
    # argparse's type=bool treats any non-empty string as True, so expose the
    # flag with store_true instead (default is False when the flag is absent).
    parser.add_argument('--visualize', action='store_true')
    args = parser.parse_args()

    ############# for debugging purposes only #############
    # debug = True
    # if debug:
    #     rewards_buffer = []
    #     history = [10, 500]
    #     goals = [0] * max(history)
    #     configure("tb/IQL" + str(datetime.now()))
    #######################################################

    MARLEnv = DiscreteMARLEnvironment(numOpponents=args.numOpponents, numAgents=args.numAgents,
                                      visualize=args.visualize)

    agents = []
    for i in range(args.numAgents):
        agent = IndependentQLearningAgent(learningRate=0.99, discountFactor=0.9, epsilon=1.0)
        agents.append(agent)

    numEpisodes = args.numEpisodes
    numTakenActions = 0
    for episode in range(numEpisodes):
        status = ["IN_GAME", "IN_GAME", "IN_GAME"]
        observation = MARLEnv.reset()
        totalReward = 0.0
        timeSteps = 0
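import math

# A minimal sketch of an IndependentQLearningAgent.computeHyperparameters
# schedule compatible with the driver above: epsilon decays exponentially
# toward a floor so some exploration survives all 50000 episodes. The decay
# constant, the 0.05 floor, and the initEpsilon/learningRate attributes are
# illustrative assumptions, not the coursework's values.
def computeHyperparameters(self, numTakenActions, episodeNumber):
    epsilon = max(0.05, self.initEpsilon * math.exp(-5e-5 * numTakenActions))
    return self.learningRate, epsilon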
        return self.loseDelta, self.winDelta, learningRate

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--numOpponents', type=int, default=1)
    parser.add_argument('--numAgents', type=int, default=2)
    parser.add_argument('--numEpisodes', type=int, default=50000)
    args = parser.parse_args()

    numOpponents = args.numOpponents
    numAgents = args.numAgents
    MARLEnv = DiscreteMARLEnvironment(numOpponents=numOpponents, numAgents=numAgents, visualize=False)

    agents = []
    for i in range(args.numAgents):
        agent = WolfPHCAgent(learningRate=0.2, discountFactor=0.99, winDelta=0.01, loseDelta=0.1)
        agents.append(agent)

    statusHistory = []
    numEpisodes = args.numEpisodes
    numTakenActions = 0
    for episode in range(numEpisodes):
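# A minimal sketch of the two WoLF-PHC policy updates the training loop calls
# (calculateAveragePolicyUpdate and calculatePolicyUpdate), following the
# standard formulation from Bowling & Veloso (2002). The internal attribute
# names (_state, _actions, _pi, _avgPi, _counts, _q) are assumptions about
# the agent's fields, not the coursework's actual implementation.
def calculateAveragePolicyUpdate(self):
    s = self._state
    self._counts[s] = self._counts.get(s, 0) + 1
    c = self._counts[s]
    for a in self._actions:
        # Incremental mean: pull the average policy toward the current policy.
        self._avgPi[s][a] += (self._pi[s][a] - self._avgPi[s][a]) / c

def calculatePolicyUpdate(self):
    s = self._state
    # "Winning" means the current policy is worth at least as much as the
    # average policy under the current Q estimates; winners adapt slowly
    # (winDelta), losers adapt fast (loseDelta).
    vPi = sum(self._pi[s][a] * self._q[(s, a)] for a in self._actions)
    vAvg = sum(self._avgPi[s][a] * self._q[(s, a)] for a in self._actions)
    delta = self.winDelta if vPi >= vAvg else self.loseDelta

    # Hill-climb: move probability mass from non-greedy actions to the
    # greedy one(s), clipped so no probability goes negative.
    maxQ = max(self._q[(s, a)] for a in self._actions)
    greedy = [a for a in self._actions if self._q[(s, a)] == maxQ]
    nonGreedy = [a for a in self._actions if a not in greedy]
    if not nonGreedy:
        return
    step = delta / len(nonGreedy)
    for a in nonGreedy:
        moved = min(step, self._pi[s][a])
        self._pi[s][a] -= moved
        for g in greedy:
            self._pi[s][g] += moved / len(greedy)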