# Shared imports used by the classes in this listing. Util, FullyConnected and
# the agent classes (SupervisedAgent, EvolvedReinforcedAgent, TrivialAgent) are
# project modules imported from the project's own files.
import math
import random
import time

import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


def __init__(self, agent='supervised', render=False, debug=True, record=False):
    self.util = Util()

    # Create a gym environment (game environment)
    self.env = gym.make('Breakout-ram-v0')
    self.env.frameskip = 0

    # Create an agent for the simulation
    if agent == 'supervised':
        self.agent = SupervisedAgent()
    elif agent == 'evolvedreinforced':
        self.agent = EvolvedReinforcedAgent()
    else:
        self.agent = TrivialAgent()

    # Initialise other variables
    self.record = record
    self.render = render
    self.debug = debug
    self.agentType = type(self.agent).__name__
class DQNAgent():

    def __init__(self):
        super().__init__()

        self.util = Util()

        self.net = FullyConnected(128, 10)
        self.net.float()

        if torch.cuda.is_available():
            self.device = torch.device('cuda')
            self.net.cuda()
            #self.net.load_state_dict(torch.load('../res/models/transfer.pth'))
        else:
            self.device = torch.device('cpu')
            #self.net.load_state_dict(torch.load('../res/models/transfer.pth', map_location=('cpu')))

        print('device: ' + str(self.device) + '\n')

        # Create a gym environment (game environment)
        self.env = gym.make('Breakout-ram-v0')
        self.env.frameskip = 0

        self.criterion = nn.MSELoss()
        self.optimizer = optim.Adam(self.net.parameters())

        # Replay memory and epsilon-greedy exploration parameters
        self.replayMemory = []
        self.explorationRate = 1.0
        self.decayRate = 0.001

    # Play numGames games, storing (state, action, reward, next state, done) transitions
    def getReplayMemory(self, numGames):
        for i in range(0, numGames):
            observation = self.env.reset()
            t = 0

            while True:
                #self.env.render()

                ram = self.util.observationToTensor(observation)
                ballY = int(observation[101])

                # Epsilon-greedy action selection
                if random.uniform(0, 1) < self.explorationRate:
                    action = random.randrange(4)
                elif t == 0 or ballY > 200 or ballY == 0:
                    action = 1
                else:
                    action = self.util.tensorAction(self.net, ram)

                newObservation, reward, done, info = self.env.step(action)

                self.replayMemory.append(
                    (self.util.observationToTensor(observation), action, reward,
                     self.util.observationToTensor(newObservation), done))

                observation = newObservation
                t += 1

                # Decay the exploration rate after every step
                self.explorationRate -= self.explorationRate * self.decayRate

                if done:
                    break

    def train(self):
        self.getReplayMemory(5)

        for replay in self.replayMemory:
            observation, action, reward, nextObservation, done = replay

            # Predicted Q-value for the stored state
            output = self.util.getOutput(self.net, observation)
            q_value = torch.max(output)

            # Q-learning target: the immediate reward plus, for non-terminal
            # transitions, the discounted maximum Q-value of the next state
            target = reward
            if not done:
                nextOutput = self.util.getOutput(self.net, nextObservation)
                target = reward + 0.5 * torch.max(nextOutput).item()

            self.optimizer.zero_grad()
            loss = self.criterion(q_value, torch.tensor(float(target)))
            loss.backward()
            self.optimizer.step()
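
# A minimal usage sketch (assumed, not part of the original source): build the
# DQN agent, which collects five games of replay memory inside train() and then
# runs one pass of Q-value updates over the stored transitions.
dqnAgent = DQNAgent()
dqnAgent.train()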
class BasicReinforcedAgent():

    def __init__(self):
        super().__init__()

        self.util = Util()

        self.net = FullyConnected(128, 10)
        self.net.float()

        if torch.cuda.is_available():
            self.device = torch.device('cuda')
            self.net.cuda()
            self.net.load_state_dict(torch.load('../res/models/transfer.pth'))
        else:
            self.device = torch.device('cpu')
            self.net.load_state_dict(
                torch.load('../res/models/transfer.pth', map_location=('cpu')))

        print('device: ' + str(self.device) + '\n')

        # Create a gym environment (game environment)
        self.env = gym.make('Breakout-ram-v0')
        self.env.frameskip = 0

        self.explorationRate = 0.05
        self.optimizer = optim.Adam(self.net.parameters())

    # Map the reward collected over a sample of steps to a target loss value
    def calculateLoss(self, sampleReward, sampleLength):
        if sampleReward < 0:
            denominator = (0.6 * sampleReward) + (sampleLength + 1)
        else:
            denominator = (sampleReward * (sampleLength * 2) + sampleLength) * 50

        if denominator == 0:
            return 0.0

        return (sampleLength / denominator)

    # One game played by the agent
    def gameCycle(self, observation, t, iteration_reward):
        sampleLength = 100
        sampleCounter = 0
        sampleReward = 0
        deathPenalty = 50
        lives = 5

        while True:
            #self.env.render()

            paddleMid = int(observation[72]) + 8
            ballMid = int(observation[99]) + 1
            ballY = int(observation[101])

            ram = self.util.observationToTensor(observation)
            output = self.util.getScaledOutput(self.net, ram)

            if t == 0 or ballY > 200 or ballY <= 0:
                action = 1  # config.ACTION_FIRE
            else:
                action = self.util.tensorAction(self.net, ram)

            observation, reward, done, info = self.env.step(action)

            # If the agent dies, incur a large penalty
            if lives > info['ale.lives']:
                sampleReward -= deathPenalty
                lives -= 1

            #if ballY >= 175:
            #    self.processLoss(paddleMid, ballMid, output)

            iteration_reward += reward
            sampleReward += reward
            t += 1
            sampleCounter += 1

            # At the end of each sample, convert the collected reward into a
            # target loss and update the network
            if sampleCounter == sampleLength:
                targetLoss = self.calculateLoss(sampleReward, sampleLength)
                #print('reward: ' + str(sampleReward) + ' loss: ' + str(targetLoss))
                self.processLoss(output, targetLoss)
                sampleCounter = 0
                sampleReward = 0

            if done:
                return iteration_reward, t

    # Calculate and apply loss
    def processLoss(self, output, targetLoss):
        self.optimizer.zero_grad()
        loss = self.util.applyLoss(output, targetLoss)
        loss.backward()
        self.optimizer.step()

    # At the end of training (or after a fixed number of cycles) print statistics and save the model
    def processTraining(self, totalReward, iterations, maxReward, elapsed_time):
        minutes = math.floor(elapsed_time / 60)
        seconds = math.floor(elapsed_time - minutes * 60)

        average_reward = totalReward / iterations
        print('average reward: ' + str(average_reward))
        print('max reward: ' + str(maxReward))
        print('Training took: ' + str(minutes) + ':' + str(seconds))
        print()

        #file = open('../res/learning_progress2.txt', 'a+')
        #file.write(str(average_reward) + ' ' + str(maxReward) + '\n')

        torch.save(self.net.state_dict(), '../res/models/evolved2.pth')

    # Train the agent
    def train(self, iterations):
        maxReward = 0
        totalReward = 0
        startTime = time.time()

        # How many iterations of the game should be played
        for i_episode in range(iterations):
            observation = self.env.reset()
            iteration_reward = 0
            t = 0

            # One complete game
            iteration_reward, t = self.gameCycle(observation, t, iteration_reward)

            print(iteration_reward)

            totalReward += iteration_reward
            if iteration_reward > maxReward:
                maxReward = iteration_reward

            if i_episode != 0 and i_episode % 1000 == 0:
                # Calculate how long this training took
                elapsed_time = time.time() - startTime
                self.processTraining(totalReward, iterations, maxReward, elapsed_time)
                startTime = time.time()

        # Once all games have finished, process training once more
        elapsed_time = time.time() - startTime
        self.processTraining(totalReward, iterations, maxReward, elapsed_time)
        startTime = time.time()
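
# A minimal usage sketch (assumed, not part of the original source): train the
# reinforced agent, which expects the pre-trained transfer.pth weights to exist.
# Statistics are printed and the weights saved every 1000 episodes and again
# once all episodes have been played.
reinforcedAgent = BasicReinforcedAgent()
reinforcedAgent.train(2000)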
class TransferAgent():

    def __init__(self, save=False, elite=False, selfLearn=False):
        super().__init__()

        self.util = Util()

        self.net = FullyConnected(128, 10)
        self.net.float()

        self.save = save

        # Select which recorded training data to use
        if elite:
            print('elite')
            self.ramPath = '../res/training data/ram100.txt'
            self.actionPath = '../res/training data/action100.txt'
        else:
            self.ramPath = '../res/training data/ram.txt'
            self.actionPath = '../res/training data/action.txt'

        # Select which weight file to load and save
        if selfLearn:
            self.weightPath = '../res/models/transferSelfLearn.pth'
        else:
            self.weightPath = '../res/models/transfer.pth'

        if torch.cuda.is_available():
            self.device = torch.device('cuda')
            self.net.cuda()
            self.net.load_state_dict(torch.load(self.weightPath))
        else:
            self.device = torch.device('cpu')
            self.net.load_state_dict(
                torch.load(self.weightPath, map_location=('cpu')))

        print('device: ' + str(self.device) + '\n')

        # Create a gym environment (game environment)
        self.env = gym.make('Breakout-ram-v0')
        self.env.frameskip = 0

        self.trainingData = self.util.loadTrainingData(self.ramPath)
        self.testingData = self.util.loadTestingData(self.actionPath)

        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = optim.Adam(self.net.parameters())

        # Create supervised agent
        self.supervisedAgent = SupervisedAgent()

    # Get the action from a trained supervised agent
    def getSupervisedAgentAction(self, observation):
        action = self.supervisedAgent.action(observation)
        return action

    # Get the action from a raw observation
    def observationAction(self, observation):
        scaled_RAM = self.util.scaleRAM(observation)
        output = self.net(scaled_RAM)
        maxVal = torch.max(output, 0)
        return int(maxVal[1])

    # Transform a tensor so its values lie in [0, 1]
    def normaliseTensor(self, tensor):
        minValue = torch.min(tensor)
        print('min: ' + str(minValue))
        maxValue = torch.max(tensor)
        print('max: ' + str(maxValue))

        normalised = []
        for value in tensor:
            normalised.append(
                (value.item() - minValue) / (maxValue - minValue))

        return torch.tensor(normalised, device=self.device)

    # Based on where the ball landed in relation to the paddle, find the target
    def getTarget(self, paddleMid, ballMid):
        arr = []

        # If the ball hits the paddle, remain in the same place
        if abs(paddleMid - ballMid) < 7:
            arr = [1.0, 0.0, 0.0, 0.0]
        # Paddle left of ball, so move right
        elif paddleMid < ballMid:
            arr = [0.0, 0.0, 1.0, 0.0]
        # Paddle right of ball, so move left
        elif paddleMid > ballMid:
            arr = [0.0, 0.0, 0.0, 1.0]

        return torch.tensor(arr, device=self.device)

    def supervisedLearn(self, iterations):
        for iteration in range(0, iterations):
            running_loss = 0.0

            for i in range(0, len(self.trainingData)):
                # zero the parameter gradients
                self.optimizer.zero_grad()

                # Scale the RAM values to be between 0-1
                scaled_RAM = self.util.scaleRAM(self.trainingData[i])

                # Get the outputs and target
                outputs = self.net(scaled_RAM)
                target = self.testingData[i].clone()

                # Unsqueeze outputs for the loss function
                outputs = outputs.unsqueeze(dim=0)

                loss = self.criterion(outputs, target)
                loss.backward()
                self.optimizer.step()

                running_loss += loss.item()

                # Every 2000 mini-batches print the loss and save the model
                if i % 2000 == 1999:
                    print('[%d, %5d] loss: %.3f' %
                          (iteration + 1, i + 1, running_loss / 2000))

                    with open('../res/transferProgress.txt', 'a+') as file:
                        file.write('[%d, %5d] loss: %.3f' %
                                   (iteration + 1, i + 1, running_loss / 2000) + '\n')

                    running_loss = 0.0

                    if self.save:
                        torch.save(self.net.state_dict(), self.weightPath)

    def selfLearn(self, iterations):
        # How many iterations of the game should be played
        for i_episode in range(iterations):
            observation = self.env.reset()
            iteration_reward = 0
            t = 0

            # One game iteration
            while True:
                # self.env.render()

                paddleMid = int(observation[72]) + 8
                ballMid = int(observation[99]) + 1
                ballY = int(observation[101])

                # Get scaled ram tensor for input to network
                scaled_RAM = self.util.scaleRAM(observation)

                if t == 0 or ballY > 200 or ballY <= 0:
                    action = 1  # config.ACTION_FIRE
                else:
                    action = self.util.tensorAction(self.net, scaled_RAM)

                observation, reward, done, info = self.env.step(action)

                # When the ball is near the top of the screen, nudge the network
                # towards the move that would have tracked the ball
                if ballY <= 25:
                    self.optimizer.zero_grad()

                    output = self.util.getOutput(self.net, scaled_RAM)
                    output = F.softmax(output, dim=0)

                    target = self.getTarget(paddleMid, ballMid)

                    criterion = nn.MSELoss()
                    loss = criterion(output, target)
                    loss.backward()
                    self.optimizer.step()

                iteration_reward += reward
                t += 1

                if done:
                    print(iteration_reward)

                    with open('../res/transferProgressSelfLearn.txt', 'a+') as file:
                        file.write(str(iteration_reward) + '\n')

                    if self.save:
                        torch.save(self.net.state_dict(),
                                   '../res/models/transferSelfLearn.pth')

                    break
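
# A minimal usage sketch (assumed, not part of the original source): pre-train
# on the recorded expert data, then fine-tune by self-play. The constructor
# flags select which data files and weight file are used, and the class expects
# those files to exist on disk.
transferAgent = TransferAgent(save=True, elite=False, selfLearn=False)
transferAgent.supervisedLearn(5)   # epochs over the recorded RAM/action data
transferAgent.selfLearn(100)       # game episodes of self-directed fine-tuning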
class SupervisedAgent():
    """ Supervised Learning agent

    Trained using training and testing data generated from the expert system
    """

    def __init__(self, load=True, save=True, saveFile=''):
        self.load = load
        self.save = save

        # object for utility functions
        self.util = Util()

        # load correct network weights
        if saveFile == '':
            self.saveFile = '../res/models/supervised.pth'
        else:
            self.saveFile = '../res/models/' + saveFile + '.pth'

        # Create network and load weights
        self.net = FullyConnected(128, 10)
        self.net = self.net.float()

        if self.load:
            self.net.load_state_dict(torch.load(self.saveFile))

        # Load training and testing data
        self.trainingData = self.util.loadTrainingData('../res/training data/evolvedObservation.txt')
        self.testingData = self.util.loadTestingData('../res/training data/evolvedAction.txt')

        # Training parameters
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = optim.Adam(self.net.parameters())

    def train(self, epochs):
        """ Train the agent for a given number of epochs

        epochs : int = number of training epochs
        """
        for epoch in range(0, epochs):
            running_loss = 0.0

            for i in range(0, len(self.trainingData)):
                self.optimizer.zero_grad()

                # Get network output and target
                tensorInput = self.trainingData[i]
                outputs = self.net(tensorInput.float())
                outputs = outputs.unsqueeze(dim=0)
                target = self.testingData[i].clone()

                # Calculate loss between network output and target from testing dataset
                loss = self.criterion(outputs, target)
                loss.backward()
                self.optimizer.step()

                # Calculate running loss and print every 2000 mini-batches
                running_loss += loss.item()
                if i % 2000 == 1999:    # print every 2000 mini-batches
                    print('[%d, %5d] loss: %.3f' %
                          (epoch + 1, i + 1, running_loss / 2000))
                    running_loss = 0.0

                    if self.save:
                        torch.save(self.net.state_dict(), self.saveFile)

    def observationAction(self, observation):
        """ Get the agent's action

        observation : list = observation representing game state
        """
        observationTensor = torch.tensor(observation)
        outputs = self.net(observationTensor.float())
        maxVal = torch.max(outputs, 0)
        return int(maxVal[1])
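
# A minimal usage sketch (assumed, not part of the original source): train the
# supervised agent on the recorded expert data and query it for an action on a
# single 128-byte RAM observation.
supervisedAgent = SupervisedAgent(load=False, save=True)
supervisedAgent.train(3)

sampleObservation = [0] * 128  # placeholder RAM state; a real one comes from env.step()
action = supervisedAgent.observationAction(sampleObservation)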