import random
import logging

import numpy as np

# StateBuffer is assumed to be provided by the surrounding project (it keeps the
# last history_length screens and assembles them into a network input); adjust
# the import path to wherever it lives in your tree.
from state_buffer import StateBuffer

logger = logging.getLogger(__name__)


class Agent:
    def __init__(self, environment, replay_memory, deep_q_network, args):
        self.env = environment
        self.mem = replay_memory
        self.net = deep_q_network
        self.buf = StateBuffer(args)
        self.num_actions = self.env.numActions()
        self.random_starts = args.random_starts
        self.history_length = args.history_length
        self.exploration_rate_start = args.exploration_rate_start
        self.exploration_rate_end = args.exploration_rate_end
        self.exploration_decay_steps = args.exploration_decay_steps
        self.exploration_rate_test = args.exploration_rate_test
        self.total_train_steps = args.start_epoch * args.train_steps
        self.train_frequency = args.train_frequency
        self.train_repeat = args.train_repeat
        self.callback = None

    def _restartRandom(self):
        self.env.restart()
        # perform random number of dummy actions to produce more stochastic games
        for i in xrange(random.randint(self.history_length, self.random_starts) + 1):
            reward = self.env.act(0)
            screen = self.env.getScreen()
            terminal = self.env.isTerminal()
            assert not terminal, "terminal state occurred during random initialization"
            # add dummy states to buffer
            self.buf.add(screen)

    def _explorationRate(self):
        # calculate decaying exploration rate
        if self.total_train_steps < self.exploration_decay_steps:
            return self.exploration_rate_start - self.total_train_steps * (self.exploration_rate_start - self.exploration_rate_end) / self.exploration_decay_steps
        else:
            return self.exploration_rate_end

    def step(self, exploration_rate):
        # exploration rate determines the probability of random moves
        if random.random() < exploration_rate:
            action = random.randrange(self.num_actions)
            logger.debug("Random action = %d" % action)
        else:
            # otherwise choose action with highest Q-value
            state = self.buf.getStateMinibatch()
            # for convenience getStateMinibatch() returns minibatch
            # where first item is the current state
            qvalues = self.net.predict(state)
            assert len(qvalues[0]) == self.num_actions
            # choose highest Q-value of first state
            action = np.argmax(qvalues[0])
            logger.debug("Predicted action = %d" % action)

        # perform the action
        reward = self.env.act(action)
        screen = self.env.getScreen()
        terminal = self.env.isTerminal()

        # print reward
        if reward != 0:
            logger.debug("Reward: %d" % reward)

        # add screen to buffer
        self.buf.add(screen)

        # restart the game if over
        if terminal:
            logger.debug("Terminal state, restarting")
            self._restartRandom()

        # call callback to record statistics
        if self.callback:
            self.callback.on_step(action, reward, terminal, screen, exploration_rate)

        return action, reward, screen, terminal

    def play_random(self, random_steps):
        # play given number of steps
        for i in xrange(random_steps):
            # use exploration rate 1 = completely random
            self.step(1)

    def train(self, train_steps, epoch=0):
        # do not do restart here, continue from testing
        #self._restartRandom()
        # play given number of steps
        for i in xrange(train_steps):
            # perform game step
            action, reward, screen, terminal = self.step(self._explorationRate())
            self.mem.add(action, reward, screen, terminal)
            # train after every train_frequency steps
            if self.mem.count > self.mem.batch_size and i % self.train_frequency == 0:
                # train for train_repeat times
                for j in xrange(self.train_repeat):
                    #logger.info("i=%d, j=%d, mem.count=%d" % (i, j, self.mem.count))
                    # sample minibatch
                    minibatch = self.mem.getMinibatch()
                    # train the network
                    self.net.train(minibatch, epoch)
            # increase number of training steps for epsilon decay
            self.total_train_steps += 1

    def test(self, test_steps, epoch=0):
        # just make sure there is history_length screens to form a state
        self._restartRandom()
        # play given number of steps
        for i in xrange(test_steps):
            # perform game step
            self.step(self.exploration_rate_test)

    def play(self, num_games):
        # just make sure there is history_length screens to form a state
        self._restartRandom()
        for i in xrange(num_games):
            # play until terminal state
            terminal = False
            while not terminal:
                action, reward, screen, terminal = self.step(self.exploration_rate_test)
                # add experiences to replay memory for visualization
                self.mem.add(action, reward, screen, terminal)
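# A minimal, self-contained sketch of the linear epsilon decay implemented by
# _explorationRate() above. The start/end/decay values used here (1.0 -> 0.1
# over 1,000,000 steps) are illustrative assumptions, not values read from args.
def linear_epsilon(step, start=1.0, end=0.1, decay_steps=1000000):
    # interpolate linearly from start to end while still inside the decay window
    if step < decay_steps:
        return start - step * (start - end) / decay_steps
    # afterwards stay at the final exploration rate
    return end

# e.g. linear_epsilon(0) == 1.0, linear_epsilon(500000) == 0.55,
#      linear_epsilon(2000000) == 0.1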
class Agent:
    def __init__(self, environment, replay_memory, deep_q_network, args):
        self.env = environment
        self.mem = replay_memory
        self.net = deep_q_network
        self.buf = StateBuffer(args)
        self.num_actions = self.env.numActions()
        self.random_starts = args.random_starts
        self.history_length = args.history_length
        self.exploration_rate_start = args.exploration_rate_start
        self.exploration_rate_end = args.exploration_rate_end
        self.exploration_decay_steps = args.exploration_decay_steps
        self.exploration_rate_test = args.exploration_rate_test
        self.total_train_steps = args.start_epoch * args.train_steps
        self.train_frequency = args.train_frequency
        self.train_repeat = args.train_repeat
        self.target_steps = args.target_steps
        self.callback = None

    def _restartRandom(self):
        self.env.restart()
        # perform random number of dummy actions to produce more stochastic games
        for i in range(random.randint(self.history_length, self.random_starts) + 1):
            reward = self.env.act(0)
            terminal = self.env.isTerminal()
            if terminal:
                self.env.restart()
            screen = self.env.getScreen()
            # add dummy states to buffer
            self.buf.add(screen)

    def _explorationRate(self):
        # calculate decaying exploration rate
        if self.total_train_steps < self.exploration_decay_steps:
            return self.exploration_rate_start - self.total_train_steps * (self.exploration_rate_start - self.exploration_rate_end) / self.exploration_decay_steps
        else:
            return self.exploration_rate_end

    def step(self, exploration_rate):
        # exploration rate determines the probability of random moves
        if random.random() < exploration_rate:
            action = random.randrange(self.num_actions)
            logger.debug("Random action = %d" % action)
        else:
            # otherwise choose action with highest Q-value
            state = self.buf.getStateMinibatch()
            # for convenience getStateMinibatch() returns minibatch
            # where first item is the current state
            qvalues = self.net.predict(state)
            assert len(qvalues[0]) == self.num_actions
            # choose highest Q-value of first state
            action = np.argmax(qvalues[0])
            logger.debug("Predicted action = %d" % action)

        # perform the action
        reward = self.env.act(action)
        screen = self.env.getScreen()
        terminal = self.env.isTerminal()

        # print reward
        if reward != 0:
            logger.debug("Reward: %d" % reward)

        # add screen to buffer
        self.buf.add(screen)

        # restart the game if over
        if terminal:
            logger.debug("Terminal state, restarting")
            self._restartRandom()

        # call callback to record statistics
        if self.callback:
            self.callback.on_step(action, reward, terminal, screen, exploration_rate)

        return action, reward, screen, terminal

    def play_random(self, random_steps):
        # call env.restart first so that env.reset is called before step
        self.env.restart()
        # play given number of steps
        for i in range(random_steps):
            # use exploration rate 1 = completely random
            action, reward, screen, terminal = self.step(1)
            self.mem.add(action, reward, screen, terminal)

    def train(self, train_steps, epoch=0):
        # do not do restart here, continue from testing
        #self._restartRandom()
        # play given number of steps
        for i in range(train_steps):
            # perform game step
            action, reward, screen, terminal = self.step(self._explorationRate())
            self.mem.add(action, reward, screen, terminal)
            # update target network every target_steps steps
            if self.target_steps and i % self.target_steps == 0:
                self.net.update_target_network()
            # train after every train_frequency steps
            if self.mem.count > self.mem.batch_size and i % self.train_frequency == 0:
                # train for train_repeat times
                for j in range(self.train_repeat):
                    # sample minibatch
                    minibatch = self.mem.getMinibatch()
                    # train the network
                    self.net.train(minibatch, epoch)
            # increase number of training steps for epsilon decay
            self.total_train_steps += 1

    def test(self, test_steps, epoch=0):
        # just make sure there is history_length screens to form a state
        self._restartRandom()
        # play given number of steps
        for i in range(test_steps):
            # perform game step
            self.step(self.exploration_rate_test)

    def play(self, num_games):
        # just make sure there is history_length screens to form a state
        self._restartRandom()
        for i in range(num_games):
            # play until terminal state
            terminal = False
            while not terminal:
                action, reward, screen, terminal = self.step(self.exploration_rate_test)
                # add experiences to replay memory for visualization
                self.mem.add(action, reward, screen, terminal)
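# A self-contained sketch of the update cadence inside Agent.train() above:
# every train_frequency-th step triggers train_repeat minibatch updates, and
# every target_steps-th step syncs the target network. The parameter values
# below are illustrative assumptions (the replay-memory warm-up check,
# mem.count > batch_size, is omitted for brevity).
def training_cadence(train_steps=10000, train_frequency=4, train_repeat=1, target_steps=10000):
    gradient_updates = 0
    target_syncs = 0
    for i in range(train_steps):
        if target_steps and i % target_steps == 0:
            target_syncs += 1
        if i % train_frequency == 0:
            gradient_updates += train_repeat
    return gradient_updates, target_syncs

# e.g. training_cadence() == (2500, 1): one minibatch update every 4 steps and
# a single target-network sync (at step 0) over 10,000 training steps.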
class GymAgent(object):
    def __init__(self, env, net, replay_memory, exploration_strategy, args):
        # env is a Gym environment instance, e.g. gym.make("Breakout-v0")
        self.env = env
        self.net = net
        self.mem = replay_memory
        self.exploration_strategy = exploration_strategy
        self.buf = StateBuffer(args)
        self.history_length = args.history_length
        self.random_starts = args.random_starts
        #self.exploration_train_strategy = exploration_strategy.args.exploration_train_strategy
        #self.exploration_test_strategy = exploration_strategy.args.exploration_test_strategy
        # until the per-phase strategies above are wired up, fall back to the
        # single strategy passed in for both training and testing
        self.exploration_train_strategy = exploration_strategy
        self.exploration_test_strategy = exploration_strategy
        self.train_net_frequency = args.train_net_frequency
        self.train_net_repeat = args.train_net_repeat
        self.callback = None

    def _restart_random(self):
        self.env.reset()
        # perform random number of dummy actions to produce more stochastic games
        for t in xrange(random.randint(self.history_length, self.random_starts) + 1):
            self.mem.action = self.env.action_space.sample()
            self.mem.observation, self.mem.reward, self.mem.done, self.mem.info = self.env.step(self.mem.action)
            assert not self.mem.done, "done state occurred during random initialization"
            # add dummy states to buffer
            # to be merged into replay_memory (self.mem) here
            self.buf.add(self.mem.observation)

    def act(self, exploration_strategy):
        # for a base Agent class, perhaps use: raise NotImplementedError
        if self.callback:
            self.callback.on_act_begin()
        # determine whether to explore; the strategy returns an action to
        # explore with, or None to exploit the Q-network
        action = exploration_strategy()
        if action is not None:
            logger.debug("Explore action = {}".format(action))
        else:
            # otherwise choose action with highest Q-value
            state = self.buf.getStateMinibatch()
            # for convenience getStateMinibatch() returns minibatch
            # where first item is the current state
            qvalues = self.net.predict(state)
            assert len(qvalues[0]) == self.env.action_space.n
            # choose highest Q-value of first state
            action = np.argmax(qvalues[0])
            logger.debug("Predicted action = {}".format(action))

        # perform the action, and update replay_memory
        self.mem.action = action
        self.mem.observation, self.mem.reward, self.mem.done, self.mem.info = self.env.step(self.mem.action)

        # add screen to buffer
        #self.buf.add(self.mem.observation)

        # restart the game if over
        if self.mem.done:
            self._restart_random()

        # call callback to log progress
        # MOVE THIS TO CALLBACK SELF.AGENT (need to add self stuff above - NO!
        # use e.g. buf.observations[last], obviously replacing "last" with the actual index):
        ## act_logs = {}
        ## act_logs['observation'] = observation
        ## act_logs['done'] = done
        ## act_logs['reward'] = reward
        ## act_logs['t'] = t
        if self.callback:
            self.callback.on_act_end(action)  # see statistics vs monitor

        return action, self.mem.observation, self.mem.reward, self.mem.done, self.mem.info

    def train(self, train_steps, episode=0):
        # CHECK WHY; in particular, surely we don't necessarily have 4 states for the convnet???
        # do not do restart here, continue from testing
        #self._restart_random()
        # play given number of steps
        for t in xrange(train_steps):
            # update agent replay memory regarding t
            self.mem.t = t
            # perform game step
            self.act(self.exploration_train_strategy)
            # train after every train_net_frequency steps
            if self.mem.count > self.mem.batch_size and t % self.train_net_frequency == 0:
                # train for train_net_repeat times
                for j in xrange(self.train_net_repeat):
                    # sample minibatch
                    minibatch = self.mem.getMinibatch()
                    # train the network
                    self.net.train(minibatch, episode)
            # restart the game if over
            if self.mem.done:
                # just make sure there is history_length screens to form a state
                # perform random number of dummy actions to produce more stochastic games
                if t < random.randint(self.history_length, self.random_starts) + 1:
                    self.act(self.exploration_strategy.play_random)

    def test(self, test_steps, episode=0):
        # play given number of steps
        for t in xrange(test_steps):
            # update agent replay memory regarding t
            # check if we trained
            if t == 0:
                test_start_t = self.mem.t
                # reset environment
                self.env.reset()
            self.mem.t = test_start_t + t
            # just make sure there is history_length screens to form a state
            # perform random number of dummy actions to produce more stochastic games
            if t < random.randint(self.history_length, self.random_starts) + 1:
                self.act(self.exploration_strategy.play_random)
            # perform game step
            self.act(self.exploration_test_strategy)

    def play(self, num_games):
        for t in xrange(num_games):
            # just make sure there is history_length screens to form a state
            # perform random number of dummy actions to produce more stochastic games
            if t < random.randint(self.history_length, self.random_starts) + 1:
                self.act(self.exploration_strategy.play_random)
            # play until terminal state
            while not self.mem.done:
                self.act(self.exploration_test_strategy)
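# GymAgent.act() above assumes exploration_strategy is a callable that either
# returns a random action (explore) or None (exploit via the Q-network). Below
# is a hedged sketch of one such callable; EpsilonGreedy, its fields and the
# fixed epsilon are hypothetical names/values, not part of the original code.
# Returning None rather than a falsy 0 matters, since action 0 must still count
# as exploration.
class EpsilonGreedy(object):
    def __init__(self, num_actions, epsilon=0.05):
        self.num_actions = num_actions
        self.epsilon = epsilon

    def __call__(self):
        # explore with probability epsilon, otherwise defer to the network
        if random.random() < self.epsilon:
            return random.randrange(self.num_actions)
        return None

# usage sketch:
#   strategy = EpsilonGreedy(env.action_space.n, epsilon=0.05)
#   agent = GymAgent(env, net, replay_memory, strategy, args)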