class VanillaDQN(acme.Actor):
    """Vanilla Deep Q-learning as implemented in the original Nature paper
    """

    def __init__(
            self,
            network: nn.Module,
            actions: int,
            logger: Optional = None,
            learning_rate: float = 0.00025,
            replay_start_size: int = 50000,
            replay_size: int = 1000000,
            batch_size: int = 32,
            sync_target_step: int = 10000,
            update_frequency: int = 4,
            gradient_clipping: bool = False,
            reward_clipping: bool = True,
            gamma: float = 0.99,
            epsilon_start: float = 1.0,
            epsilon_end: float = 0.1,
            epsilon_end_step: int = 1000000,
            epsilon_testing: float = 0.05,
            training: bool = True,
            device: str = 'gpu',
            seed: Optional[int] = None
    ):
        """
        Initializes a DQN agent

        Args:
            network: a neural network to learn the Q-function
            actions: number of actions the agent can take
            logger: a logger that has a write method which receives scalars and a timestep
            learning_rate: the learning rate for the optimizer
            replay_start_size: minimum number of samples in memory before optimization starts,
                is also the number of time steps taken before reducing epsilon
            replay_size: maximum size of the replay buffer
            batch_size: number of samples for each parameter update
            sync_target_step: number of policy updates before updating the target network parameters
            update_frequency: number of time steps between each learning step
            gradient_clipping: if True, the gradients are clipped between -1 and 1
            reward_clipping: if True, the rewards are clipped between -1 and 1
            gamma: the discount factor for the MDP
            epsilon_start: value of epsilon at start of training
            epsilon_end: value of epsilon at end of training
            epsilon_end_step: number of time steps over which epsilon is linearly decayed
            epsilon_testing: value of epsilon during testing
            training: if True the agent is training, if False it is testing
            device: device to be used in pytorch, either `gpu` or `cpu`
            seed: the random seed
        """
        if seed is not None:
            torch.random.manual_seed(seed)

        # selecting the device to use
        self._device = torch.device("cuda" if torch.cuda.is_available() and device == 'gpu' else "cpu")
        print(f"Using {self._device}...")

        # creating the target network; eval() has no effect here since the network has no dropout
        self._policy_network = network.to(self._device)
        self._target_network = deepcopy(self._policy_network).to(self._device)
        self._target_network.eval()

        # saving the logger
        if logger is not None:
            self._logger = logger

        # initializing the optimizer and saving some optimization related parameters
        self._learning_rate = learning_rate
        # self._optimizer = RMSprop(self._policy_network.parameters(), self._learning_rate)
        self._optimizer = torch.optim.Adam(self._policy_network.parameters(), lr=0.0000625, eps=0.00015)
        # self._optimizer = torch.optim.Adam(self._policy_network.parameters(), lr=0.0000125, eps=0.00015)
        self._batch_size = batch_size
        self._sync_target_step = sync_target_step
        self._update_frequency = update_frequency
        self._gradient_clipping = gradient_clipping
        self._loss_fn = torch.nn.L1Loss(reduction="none")
        self._reward_clipping = reward_clipping

        # setting the action space
        self._actions = actions
        self._num_steps = 0

        # setting the replay buffer
        self._replay_start_size = replay_start_size
        self._replay_size = replay_size
        self._memory = ReplayMemory(size=replay_size, seed=seed)

        # setting the MDP parameters
        self._gamma = gamma

        # setting the exploration parameters
        self._epsilon_end = epsilon_end
        self._epsilon_diff = epsilon_start - epsilon_end
        self._epsilon_end_step = epsilon_end_step
        self._epsilon_testing = epsilon_testing
        self._epsilon = epsilon_start

        # setting the training status
        self._training = training
        self._timestep = None
        self._next_timestep = None

    def select_action(
            self,
            observation: acme.types.NestedArray,
    ) -> acme.types.NestedArray:
        """Selects an action according to the epsilon greedy policy
        """
        if self._exploration_rate <= torch.rand(1).item():
            tensor_observation = torch.tensor([observation], dtype=torch.float32, device=self._device)
            # the action is selected with probability 1-epsilon according to the policy network
            with torch.no_grad():
                q_values = self._policy_network(tensor_observation)
            return q_values.argmax().item()
        else:
            return torch.randint(high=self._actions, size=(1,)).item()

    def observe_first(
            self,
            timestep: dm_env.TimeStep,
    ):
        """Observes the first time step
        """
        self._next_timestep = timestep

    def observe(
            self,
            action: acme.types.NestedArray,
            next_timestep: dm_env.TimeStep,
    ):
        """Observes a time step and saves a transition if the agent is training
        """
        self._timestep = self._next_timestep
        self._next_timestep = next_timestep

        if self._training:
            # if the agent is training, saves the transition
            # (current time step, action taken, next time step)
            transition = (self._timestep, action, self._next_timestep)
            self._memory.push(transition)

        self._num_steps += 1  # increment the number of steps the agent took

        # if a logger exists we also log the current epsilon
        if self._logger is not None:
            data = {'epsilon': self._epsilon, 'replay_size': len(self._memory)}
            self._logger.write(data, self._num_steps)

    def update(
            self,
            wait: bool = False
    ):
        """Performs a Q-learning update

        Args:
            wait: not used since the algorithm is single process
        """
        # if the number of steps taken is larger than the initial number of samples needed
        # and the number of steps is a multiple of the update frequency an update is performed
        if (self._num_steps >= self._replay_start_size) and (self._num_steps % self._update_frequency == 0):
            # samples `batch_size` samples from memory
            transitions = self._memory.sample(self._batch_size)
        else:
            return

        device = self._device
        curr_transitions, actions, next_transitions = list(zip(*transitions))
        actions = torch.tensor(actions, device=device)
        rewards = torch.tensor([x.reward for x in next_transitions], device=device)
        curr_observations = torch.stack([torch.from_numpy(x.observation) for x in curr_transitions]).float().to(device)
        next_observations = torch.stack([torch.from_numpy(x.observation) for x in next_transitions]).float().to(device)
        done_mask = torch.tensor([x.last() for x in next_transitions], device=device, dtype=torch.bool)

        # perform reward clipping
        if self._reward_clipping:
            rewards = rewards.clamp(-1, 1)

        curr_values = self._policy_network(curr_observations)
        # the value of the current state is the value of the action that was taken
        curr_state_values = curr_values.gather(1, actions.unsqueeze(-1)).squeeze(-1)

        with torch.no_grad():
            next_values = self._target_network(next_observations)
            # the value of the next state is the maximum; we have to take the first element since max
            # returns a tuple of (values, indices)
            next_state_values = next_values.max(1)[0]
            # the value of a terminal state is 0
            next_state_values[done_mask] = 0.0

        # computes a Huber-style loss (quadratic below 1, linear above),
        # using the Q-learning estimate as the target state-action value
        loss = self._loss_fn(curr_state_values, rewards + next_state_values * self._gamma)
        loss[loss < 1] = loss[loss < 1] ** 2
        loss = loss.mean()

        # resets the gradients computed in the optimizer and does backpropagation
        self._optimizer.zero_grad()
        loss.backward()

        # performs gradient clipping
        if self._gradient_clipping:
            for param in self._policy_network.parameters():
                param.grad.data.clamp_(-1, 1)

        # updates the parameters
        self._optimizer.step()

        # periodically update the target network
        if (self._num_steps // self._update_frequency) % self._sync_target_step == 0:
            model_parameters = self._policy_network.state_dict()
            # noinspection PyTypeChecker
            self._target_network.load_state_dict(model_parameters)

        if self._logger is not None:
            data = {'loss': loss}
            self._logger.write(data, self._num_steps)

    @property
    def _exploration_rate(self):
        """Exploration rate (epsilon) which decays linearly during training
        """
        if self._training:
            time_diff = (self._epsilon_end_step - max(0, self._num_steps - self._replay_start_size))
            epsilon = self._epsilon_end + max(0., (self._epsilon_diff * time_diff) / self._epsilon_end_step)
        else:
            epsilon = self._epsilon_testing
        self._epsilon = epsilon
        return epsilon

    def training(self):
        """Changes the agent mode to train
        """
        self._training = True

    def testing(self):
        """Changes the agent mode to test
        """
        self._training = False
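
# A minimal usage sketch (an addition, not part of the original source): driving the
# VanillaDQN agent above through its acme.Actor interface with a dm_env-style environment.
# The `env` and `num_episodes` arguments are placeholders; any dm_env.Environment whose
# observations match the policy network's input would do.
def run_training_loop(env: dm_env.Environment, agent: VanillaDQN, num_episodes: int):
    for _ in range(num_episodes):
        timestep = env.reset()
        agent.observe_first(timestep)
        while not timestep.last():
            # epsilon-greedy action from the policy network
            action = agent.select_action(timestep.observation)
            timestep = env.step(action)
            # store the transition and, every `update_frequency` steps, learn
            agent.observe(action, next_timestep=timestep)
            agent.update()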
class Brain:
    def __init__(self, cfg, tetris):
        self.num_actions = cfg.MODEL.SIZE_ACTION
        self.gamma = cfg.SOLVER.GAMMA
        self.BATCH_SIZE = cfg.SOLVER.BATCH_SIZE
        # keep the transition namedtuple on the instance so replay() can rebuild batches
        self.transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))
        self.memory = ReplayMemory(cfg.SOLVER.CAPACITY, self.transition)
        self.model = get_model(cfg)
        self.target_net = copy.deepcopy(self.model)
        self.target_net.load_state_dict(self.model.state_dict())
        self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)
        self.tetris = tetris

    def replay(self):
        if len(self.memory) < self.BATCH_SIZE:
            return

        transitions = self.memory.sample(self.BATCH_SIZE)
        batch = self.transition(*zip(*transitions))

        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        non_final_next_states = torch.cat(
            [s for s in batch.next_state if s is not None])

        self.model.eval()
        state_action_values = self.model(state_batch).gather(1, action_batch)

        non_final_mask = torch.tensor(
            tuple(map(lambda s: s is not None, batch.next_state)), dtype=torch.bool)
        next_state_values = torch.zeros(self.BATCH_SIZE)

        self.target_net.eval()
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states).max(1)[0].detach()

        expected_state_action_values = (next_state_values * self.gamma) + reward_batch

        self.model.train()
        loss = F.smooth_l1_loss(state_action_values,
                                expected_state_action_values.unsqueeze(1))

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_target_model(self):
        self.target_net.load_state_dict(self.model.state_dict())

    def decide_action(self, state, mino, episode):
        epsilon = 0.41 * (1 / (episode + 1))
        if epsilon <= np.random.uniform(0, 1):
            self.model.eval()
            with torch.no_grad():
                action = self.tetris.get_masked_action(self.model(state), mino)
        else:
            action = torch.LongTensor(
                [[self.tetris.get_random_masked_action(mino)]])
        return action

    def brain_predict(self, state):
        self.model.eval()
        with torch.no_grad():
            action = self.model(state).max(1)[1].view(1, 1)
        return action
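
# A rough training-loop sketch (an addition, not part of the original source) showing how
# Brain is typically wired up. The Tetris environment API used here (`reset`, `step`) and
# the `memory.push` signature are assumptions and may differ from the project's own code.
def train_brain(brain, tetris, num_episodes, target_update_interval=10):
    for episode in range(num_episodes):
        state, mino = tetris.reset()  # assumed API
        done = False
        while not done:
            action = brain.decide_action(state, mino, episode)
            next_state, reward, done, mino = tetris.step(action)  # assumed API
            # terminal transitions store None as the next state (see replay())
            brain.memory.push(state, action, None if done else next_state, reward)
            brain.replay()
            state = next_state
        # refresh the target network every few episodes
        if episode % target_update_interval == 0:
            brain.update_target_model()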
def main():
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env', default='Breakout-v0', help='Atari env name')
    parser.add_argument('-o', '--output', default='atari-v0', help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    parser.add_argument('--input_shape', nargs=2, type=int, default=None, help='Input shape')
    parser.add_argument('--num_frame', default=4, type=int, help='Number of frames in a state')
    parser.add_argument('--discount', default=0.99, type=float, help='Discount factor gamma')
    parser.add_argument('--online_train_interval', default=4, type=int, help='Interval to train the online network')
    parser.add_argument('--target_reset_interval', default=10000, type=int, help='Interval to reset the target network')
    parser.add_argument('--action_change_interval', default=1, type=int, help='Interval to change action')
    parser.add_argument('--print_loss_interval', default=100, type=int, help='Interval to print losses')
    parser.add_argument('--replay_buffer_size', default=100000, type=int, help='Replay buffer size')
    parser.add_argument('--num_burn_in', default=25000, type=int, help='Number of samples filled in memory before update')
    parser.add_argument('--batch_size', default=32, type=int, help='How many samples in each minibatch')
    parser.add_argument('--learning_rate', default=1e-4, type=float, help='Learning rate alpha')
    parser.add_argument('--explore_prob', default=0.05, type=float, help='Exploration probability in epsilon-greedy')
    parser.add_argument('--decay_prob_start', default=1.0, type=float, help='Starting probability in linear-decay epsilon-greedy')
    parser.add_argument('--decay_prob_end', default=0.1, type=float, help='Ending probability in linear-decay epsilon-greedy')
    parser.add_argument('--decay_steps', default=1000000, type=int, help='Decay steps in linear-decay epsilon-greedy')
    parser.add_argument('--num_train', default=5000000, type=int, help='Number of training sampled interactions with the environment')
    parser.add_argument('--max_episode_length', default=999999, type=int, help='Maximum length of an episode')
    parser.add_argument('--save_interval', default=100000, type=int, help='Interval to save weights and memory')
    parser.add_argument('--model_name', default='dqn', type=str, help='Model name')
    parser.add_argument('--eval_interval', default=10000, type=int, help='Evaluation interval')
    parser.add_argument('--eval_episodes', default=20, type=int, help='Number of episodes in evaluation')
    # store_true avoids the argparse pitfall where type=bool treats any non-empty string as True
    parser.add_argument('--double_q', action='store_true', help='Invoke double Q net')
    parser.add_argument('--do_render', action='store_true', help='Do rendering or not')
    parser.add_argument('--read_weights', default=None, type=str, help='Read weights from file')
    parser.add_argument('--read_memory', default=None, type=str, help='Read memory from file')
    args = parser.parse_args()
    print('########## All arguments ##########:', args)

    args.input_shape = tuple(args.input_shape)
    args.output = get_output_folder(args.output, args.env)

    env = gym.make(args.env)
    num_actions = env.action_space.n
    opt_adam = Adam(lr=args.learning_rate)

    model_online = create_model(args.num_frame, args.input_shape, num_actions,
                                model_name=args.model_name)
    model_target = create_model(args.num_frame, args.input_shape, num_actions,
                                model_name=args.model_name)
    q_network = {'online': model_online, 'target': model_target}

    preproc = AtariPreprocessor(args.input_shape)
    memory = ReplayMemory(args.replay_buffer_size, args.num_frame)

    policy_random = UniformRandomPolicy(num_actions)
    policy_train = LinearDecayGreedyEpsilonPolicy(args.decay_prob_start,
                                                  args.decay_prob_end,
                                                  args.decay_steps)
    policy_eval = GreedyEpsilonPolicy(args.explore_prob)
    policy = {'random': policy_random, 'train': policy_train, 'eval': policy_eval}

    agent = DQNAgent(num_actions, q_network, preproc, memory, policy, args)
    agent.compile([mean_huber_loss, null_loss], opt_adam)

    if args.read_weights is not None:
        agent.q_network['online'].load_weights(args.read_weights)
    if args.read_memory is not None:
        with open(args.read_memory, 'rb') as save_memory:
            agent.memory = pickle.load(save_memory)

    print('########## training #############')
    agent.fit(env)
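
# Sketch (an addition, not part of the original source): one common way the `mean_huber_loss`
# referenced in agent.compile() above is written with the Keras backend. The project may
# define it differently; this assumes a clipping threshold of 1.0, matching Nature-DQN
# error clipping.
from keras import backend as K

def mean_huber_loss(y_true, y_pred, max_grad=1.0):
    # quadratic inside [-max_grad, max_grad], linear outside
    err = y_true - y_pred
    quadratic = K.minimum(K.abs(err), max_grad)
    linear = K.abs(err) - quadratic
    return K.mean(0.5 * K.square(quadratic) + max_grad * linear)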
# helper method for reshaping the cartpole observation
def reshape(state):
    return np.reshape(state, [1, 4])


if __name__ == '__main__':
    tf.compat.v1.disable_eager_execution()

    max_score = 0
    n_episodes = 5000
    max_env_steps = 1000

    env = gym.make('CartPole-v0')
    agent = DQNAgent(env=env,
                     net=NN(alpha=0.001, decay=0.0001),
                     memory=ReplayMemory(size=100000))

    if max_env_steps is not None:
        env._max_episode_steps = max_env_steps

    for e in range(n_episodes):
        # reset the env
        state = reshape(env.reset())
        done = False
        score = 0

        # play until env done
        while not done:
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            # env.render()
            next_state = reshape(next_state)
import gym

from dqn.bots import AtariBot
from dqn.policy import DDQNPolicy
from dqn.memory import ReplayMemory

GAME = 'Breakout-v0'

# TODO: List params to tune here, eventually migrate this to a readme

if __name__ == "__main__":
    policy = DDQNPolicy()
    memory = ReplayMemory()

    game = gym.make(GAME)
    game.ale.setInt(b'frame_skip', 4)

    robot = AtariBot(policy=policy, memory=memory)
    robot.train(game=game, ckpt_dir="models")
class NeuralQLearner:
    def __init__(self, settings):
        assert(type(settings) == dict)

        # seed
        self.rng = settings['RNG']

        # Epsilon
        self.epsilon_start = settings['EPSILON_START']
        self.epsilon_end = settings['EPSILON_END']
        self.epsilon_end_time = settings['EPSILON_END_TIME']
        self.testing_epsilon = settings['TESTING_EPSILON']
        self.epsilon_decay = (self.epsilon_start - self.epsilon_end) / float(self.epsilon_end_time)

        # Training
        self.learning_rate = settings['LEARNING_RATE']
        self.rmsprop_rho = settings['RMSPROP_RHO']
        self.rmsprop_epsilon = settings['RMSPROP_EPSILON']
        self.target_net_update = settings['TARGET_NET_UPDATE']
        self.min_reward = settings['MIN_REWARD']
        self.max_reward = settings['MAX_REWARD']

        # Q-Learning Parameters
        self.n_actions = settings['N_ACTIONS']
        self.discount_factor = settings['DISCOUNT_FACTOR']
        self.update_frequency = settings['UPDATE_FREQUENCY']
        self.learn_start = settings['LEARN_START']
        self.agent_history_length = settings['AGENT_HISTORY_LENGTH']
        self.batch_size = settings['BATCH_SIZE']

        # Preprocess
        self.resize_width = settings['RESIZE_WIDTH']
        self.resize_height = settings['RESIZE_HEIGHT']
        self.resize_dims = (self.resize_width, self.resize_height)

        self.net = DeepQNetwork(
            self.n_actions,
            self.agent_history_length,
            self.resize_height,
            self.resize_width
        )
        self.target_net = DeepQNetwork(
            self.n_actions,
            self.agent_history_length,
            self.resize_height,
            self.resize_width
        )
        self.target_net.setWeights(self.net.getWeights())

        self.memory = ReplayMemory(settings)

        self.numSteps = 0
        self.lastState = None
        self.lastAction = None
        self.lastTerminal = None

        self.compile()

    def compile(self):
        input_shape = (
            self.batch_size,
            self.agent_history_length,
            self.resize_height,
            self.resize_width
        )
        pred_input_shape = (
            1,
            self.agent_history_length,
            self.resize_height,
            self.resize_width
        )
        self.pred_input = shared(np.zeros(pred_input_shape, dtype=floatX))
        self.net_input = shared(np.zeros(input_shape, dtype=floatX))
        self.target_net_input = shared(np.zeros(input_shape, dtype=floatX))
        self.shared_actions = shared(np.zeros((self.batch_size,), dtype='int32'))
        self.shared_rewards = shared(np.zeros((self.batch_size,), dtype='int32'))
        self.shared_terminals = shared(np.zeros((self.batch_size,), dtype=floatX))

        actions = T.ivector()
        rewards = T.ivector()
        terminals = T.vector()

        targets = (rewards
                   + (T.ones_like(terminals) - terminals)
                   * self.discount_factor
                   * T.max(self.target_net.qvalues, axis=1))
        diff = targets - self.net.qvalues[T.arange(self.batch_size), actions]

        # Huber-style loss: quadratic part qp below 1, linear part lp above
        qp = T.minimum(abs(diff), 1.0)
        lp = abs(diff) - qp
        delta = 0.5 * qp ** 2 + lp
        cost = T.sum(delta)

        optimizer = RMSprop(
            cost,
            self.net.params,
            lr=self.learning_rate,
            rho=self.rmsprop_rho,
            epsilon=self.rmsprop_epsilon
        )

        givens = {
            self.net.input: self.net_input,
            self.target_net.input: self.target_net_input,
            actions: self.shared_actions,
            rewards: self.shared_rewards,
            terminals: self.shared_terminals
        }

        self.train = function(
            inputs=[],
            outputs=cost,
            updates=optimizer.getUpdates(),
            givens=givens
        )
        self.prediction = function(
            inputs=[],
            outputs=self.net.qvalues.flatten(1),
            givens={
                self.net.input: self.pred_input
            }
        )

    def preprocess(self, rawstate):
        return cv2.resize(rawstate, self.resize_dims, interpolation=cv2.INTER_LINEAR)

    def getEpsilon(self):
        current_epsilon = self.epsilon_start - (self.numSteps * self.epsilon_decay)
        return max(self.epsilon_end, current_epsilon)

    def qLearnMinibatch(self):
        s1, a, r, t, s2 = self.memory.sampleMinibatch()
        # borrow=True so the arrays are not copied, which is faster
        self.net_input.set_value(s1, borrow=True)
        self.shared_actions.set_value(a, borrow=True)
        self.shared_rewards.set_value(r, borrow=True)
        self.shared_terminals.set_value(t, borrow=True)
        self.target_net_input.set_value(s2, borrow=True)
        return self.train()

    def perceive(self, rawstate, reward, terminal, testing):
        state = self.preprocess(rawstate)
        reward = max(reward, self.min_reward)
        reward = min(reward, self.max_reward)

        self.memory.storeRecentState(state, terminal)

        if (not testing) and (self.lastState is not None):
            self.memory.storeTransition(self.lastState, self.lastAction, reward, self.lastTerminal)

        actionIndex = 0
        if not terminal:
            actionIndex = self.eGreedy(testing)

        flag1 = (self.numSteps > self.learn_start)
        flag2 = (self.numSteps % self.update_frequency == 0)
        # short-circuit evaluation: qLearnMinibatch only runs when all conditions hold
        if (not testing) and flag1 and flag2:
            cost = self.qLearnMinibatch()

        if self.numSteps % self.target_net_update == 0:
            self.target_net.setWeights(self.net.getWeights())

        self.lastState = state
        self.lastAction = actionIndex
        self.lastTerminal = terminal

        if not testing:
            self.numSteps += 1

        return actionIndex

    def eGreedy(self, testing):
        epsilon = self.testing_epsilon if testing else self.getEpsilon()
        if self.rng.uniform(0, 1) < epsilon:
            return self.rng.randint(0, self.n_actions)
        else:
            return self.greedy()

    def greedy(self):
        curState = self.memory.getRecentState()
        curState = curState.reshape(1, curState.shape[0], curState.shape[1], curState.shape[2])
        self.pred_input.set_value(curState, borrow=True)
        q = self.prediction()

        # collect all actions tied for the maximum Q-value and break ties randomly
        maxq = q[0]
        besta = [0]
        for a in xrange(1, self.n_actions):
            if q[a] > maxq:
                maxq = q[a]
                besta = [a]
            elif q[a] == maxq:
                besta.append(a)

        r = self.rng.randint(0, len(besta))
        return besta[r]
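
# A minimal usage sketch (an addition, not part of the original source): one way to drive
# NeuralQLearner.perceive() from a gym/ALE-style loop. The environment API used here
# (reset() returning a raw frame, step() returning a 4-tuple) is assumed and may differ
# from the project's own runner.
def run_episode(agent, env, testing=False):
    rawstate = env.reset()
    reward, terminal = 0, False
    total_reward = 0
    while not terminal:
        # perceive() stores the transition, optionally learns, and returns an action index
        action = agent.perceive(rawstate, reward, terminal, testing)
        rawstate, reward, terminal, _ = env.step(action)
        total_reward += reward
    # let the agent observe the terminal state as well
    agent.perceive(rawstate, reward, terminal, testing)
    return total_reward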