Example #1
    def __init__(self, cfg, tetris):
        self.num_actions = cfg.MODEL.SIZE_ACTION
        self.gamma = cfg.SOLVER.GAMMA
        self.BATCH_SIZE = cfg.SOLVER.BATCH_SIZE

        transition = namedtuple('Transicion',
                                ('state', 'action', 'next_state', 'reward'))
        self.memory = ReplayMemory(cfg.SOLVER.CAPACITY, transition)
        self.model = get_model(cfg)

        self.target_net = copy.deepcopy(self.model)
        self.target_net.load_state_dict(self.model.state_dict())

        self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)

        self.tetris = tetris
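Example #1 pulls all of its hyperparameters from a nested config object. A minimal sketch of what that config might look like, assuming a yacs-style CfgNode (the concrete config class and the values shown are assumptions, not taken from the example):

# Hypothetical config sketch: the example only shows that cfg exposes
# cfg.MODEL.SIZE_ACTION, cfg.SOLVER.GAMMA, cfg.SOLVER.BATCH_SIZE and
# cfg.SOLVER.CAPACITY; a yacs CfgNode is one common way to provide them.
from yacs.config import CfgNode as CN

cfg = CN()
cfg.MODEL = CN()
cfg.MODEL.SIZE_ACTION = 5      # number of discrete actions (assumed value)
cfg.SOLVER = CN()
cfg.SOLVER.GAMMA = 0.99        # discount factor (assumed value)
cfg.SOLVER.BATCH_SIZE = 32     # minibatch size (assumed value)
cfg.SOLVER.CAPACITY = 10000    # replay memory capacity (assumed value)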
Example #2
    def __init__(self, settings):

        assert isinstance(settings, dict)

        # seed
        self.rng = settings['RNG']

        # Epsilon
        self.epsilon_start = settings['EPSILON_START']
        self.epsilon_end = settings['EPSILON_END']
        self.epsilon_end_time = settings['EPSILON_END_TIME']
        self.testing_epsilon = settings['TESTING_EPSILON']
        self.epsilon_decay = (self.epsilon_start-self.epsilon_end)/float(self.epsilon_end_time)

        # Training
        self.learning_rate = settings['LEARNING_RATE']
        self.rmsprop_rho = settings['RMSPROP_RHO']
        self.rmsprop_epsilon = settings['RMSPROP_EPSILON']
        self.target_net_update = settings['TARGET_NET_UPDATE']
        self.min_reward = settings['MIN_REWARD']
        self.max_reward = settings['MAX_REWARD']

        # Q-Learning Parameters
        self.n_actions = settings['N_ACTIONS']
        self.discount_factor = settings['DISCOUNT_FACTOR']
        self.update_frequency = settings['UPDATE_FREQUENCY']
        self.learn_start = settings['LEARN_START']
        self.agent_history_length = settings['AGENT_HISTORY_LENGTH']
        self.batch_size = settings['BATCH_SIZE']

        # Preprocess
        self.resize_width = settings['RESIZE_WIDTH']
        self.resize_height = settings['RESIZE_HEIGHT']
        self.resize_dims = (self.resize_width, self.resize_height)

        self.net = DeepQNetwork(
            self.n_actions,
            self.agent_history_length,
            self.resize_height,
            self.resize_width
        )
        self.target_net = DeepQNetwork(
            self.n_actions,
            self.agent_history_length,
            self.resize_height,
            self.resize_width
        )
        self.target_net.setWeights(self.net.getWeights())

        self.memory = ReplayMemory(settings)

        self.numSteps = 0
        self.lastState = None
        self.lastAction = None
        self.lastTerminal = None

        self.compile()
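Example #2 precomputes a per-step decrement for a linear epsilon schedule; the same formula reappears in getEpsilon in Example #9. A minimal sketch of the schedule it implies (the default values here are the usual DQN paper settings, not values taken from the example):

# Linear epsilon decay implied by the settings above: epsilon falls from
# EPSILON_START to EPSILON_END over EPSILON_END_TIME steps, then stays flat.
def linear_epsilon(step, start=1.0, end=0.1, end_time=1000000):
    decay = (start - end) / float(end_time)
    return max(end, start - step * decay)

# e.g. linear_epsilon(0) == 1.0, linear_epsilon(500000) == 0.55,
# linear_epsilon(2000000) == 0.1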
Example #3
 def __init__(
         self,
         network: nn.Module,
         actions: int,
         logger: Optional = None,
         learning_rate: float = 0.00025,
         replay_start_size: int = 50000,
         replay_size: int = 1000000,
         batch_size: int = 32,
         sync_target_step: int = 10000,
         update_frequency: int = 4,
         gradient_clipping: bool = False,
         reward_clipping: bool = True,
         gamma: float = 0.99,
         epsilon_start: float = 1.0,
         epsilon_end: float = 0.1,
         epsilon_end_step: int = 1000000,
         epsilon_testing: float = 0.05,
         training: bool = True,
         device: str = 'gpu',
         seed: Optional[int] = None
 ):
     """
     Initializes a DQN agent
     
     Args:
         network: a neural network to learn the Q-function
         actions: number of actions the agent can take
         logger: a logger that has a write method which receives scalars and a timestep
         learning_rate: the learning rate for the optimizer
         replay_start_size: minimum number of samples in memory before optimization starts, is also the
             number of time steps taken before reducing epsilon
         replay_size: maximum size of the replay buffer
         batch_size: number of samples for each parameter update
         sync_target_step: number of policy updates before updating the target network parameters
         update_frequency: number of time steps between each learning step
         gradient_clipping: if True, the gradients are clipped between -1 and 1
         reward_clipping: if True, the rewards are clipped between -1 and 1
         gamma: the discount factor for the MDP
         epsilon_start: value of epsilon at start of training
         epsilon_end: value of epsilon at end of training
         epsilon_end_step: number of time steps where the epsilon is linearly decayed
         epsilon_testing: value of epsilon during testing
         training: if True the agent is training; if False, it is testing
         device: device to be used in pytorch, either `gpu` or `cpu`
         seed: the random seed
     """
     
     if seed is not None:
         torch.random.manual_seed(seed)
     
     # selecting the device to use
     self._device = torch.device("cuda" if torch.cuda.is_available() and device == 'gpu' else "cpu")
     print(f"Using {self._device}...")
     
     # creating the target network, eval doesn't do anything since we are not using dropout
     self._policy_network = network.to(self._device)
     self._target_network = deepcopy(self._policy_network).to(self._device)
     self._target_network.eval()
     
     # saving the logger (it may be None; callers should check before logging)
     self._logger = logger
     
     # initializing the optimizer and saving some optimization related parameters
     self._learning_rate = learning_rate
     # self._optimizer = RMSprop(self._policy_network.parameters(), self._learning_rate)
     self._optimizer = torch.optim.Adam(self._policy_network.parameters(), lr=0.0000625, eps=0.00015)
     # self._optimizer = torch.optim.Adam(self._policy_network.parameters(), lr=0.0000125, eps=0.00015)
     self._batch_size = batch_size
     self._sync_target_step = sync_target_step
     self._update_frequency = update_frequency
     self._gradient_clipping = gradient_clipping
     self._loss_fn = torch.nn.L1Loss(reduction="none")
     self._reward_clipping = reward_clipping
     
     # setting the action space
     self._actions = actions
     self._num_steps = 0
     
     # setting the replay buffer
     self._replay_start_size = replay_start_size
     self._replay_size = replay_size
     self._memory = ReplayMemory(size=replay_size, seed=seed)
     
     # setting the MDP parameters
     self._gamma = gamma
     
     # setting the exploration parameters
     self._epsilon_end = epsilon_end
     self._epsilon_diff = epsilon_start - epsilon_end
     self._epsilon_end_step = epsilon_end_step
     self._epsilon_testing = epsilon_testing
     self._epsilon = epsilon_start
     
     # setting the training status
     self._training = training
     
     self._timestep = None
     self._next_timestep = None
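The constructor in Example #3 accepts any nn.Module that maps a batch of observations to one Q-value per action. A minimal sketch of a compatible network for a flat observation vector (the sizes used here are illustrative assumptions):

import torch.nn as nn

# Tiny Q-network sketch: observation vector -> hidden layer -> |actions| Q-values.
# obs_size, actions and hidden are illustrative, not taken from the example.
class TinyQNetwork(nn.Module):
    def __init__(self, obs_size: int = 4, actions: int = 2, hidden: int = 64):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(obs_size, hidden),
            nn.ReLU(),
            nn.Linear(hidden, actions),
        )

    def forward(self, x):
        return self.layers(x)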
Example #4
class VanillaDQN(acme.Actor):
    """Vanilla Deep Q-learning as implemented in the original Nature paper
    """
    
    def __init__(
            self,
            network: nn.Module,
            actions: int,
            logger: Optional = None,
            learning_rate: float = 0.00025,
            replay_start_size: int = 50000,
            replay_size: int = 1000000,
            batch_size: int = 32,
            sync_target_step: int = 10000,
            update_frequency: int = 4,
            gradient_clipping: bool = False,
            reward_clipping: bool = True,
            gamma: float = 0.99,
            epsilon_start: float = 1.0,
            epsilon_end: float = 0.1,
            epsilon_end_step: int = 1000000,
            epsilon_testing: float = 0.05,
            training: bool = True,
            device: str = 'gpu',
            seed: Optional[int] = None
    ):
        """
        Initializes a DQN agent
        
        Args:
            network: a neural network to learn the Q-function
            actions: number of actions the agent can take
            logger: a logger that has a write method which receives scalars and a timestep
            learning_rate: the learning rate for the optimizer
            replay_start_size: minimum number of samples in memory before optimization starts, is also the
                number of time steps taken before reducing epsilon
            replay_size: maximum size of the replay buffer
            batch_size: number of samples for each parameter update
            sync_target_step: number of policy updates before updating the target network parameters
            update_frequency: number of time steps between each learning step
            gradient_clipping: if True, the gradients are clipped between -1 and 1
            reward_clipping: if True, the rewards are clipped between -1 and 1
            gamma: the discount factor for the MDP
            epsilon_start: value of epsilon at start of training
            epsilon_end: value of epsilon at end of training
            epsilon_end_step: number of time steps where the epsilon is linearly decayed
            epsilon_testing: value of epsilon during testing
            training: if True the agent is training; if False, it is testing
            device: device to be used in pytorch, either `gpu` or `cpu`
            seed: the random seed
        """
        
        if seed is not None:
            torch.random.manual_seed(seed)
        
        # selecting the device to use
        self._device = torch.device("cuda" if torch.cuda.is_available() and device == 'gpu' else "cpu")
        print(f"Using {self._device}...")
        
        # creating the target network, eval doesn't do anything since we are not using dropout
        self._policy_network = network.to(self._device)
        self._target_network = deepcopy(self._policy_network).to(self._device)
        self._target_network.eval()
        
        # saving the logger (it may be None; `observe` and `update` check for this)
        self._logger = logger
        
        # initializing the optimizer and saving some optimization related parameters
        self._learning_rate = learning_rate
        # self._optimizer = RMSprop(self._policy_network.parameters(), self._learning_rate)
        self._optimizer = torch.optim.Adam(self._policy_network.parameters(), lr=0.0000625, eps=0.00015)
        # self._optimizer = torch.optim.Adam(self._policy_network.parameters(), lr=0.0000125, eps=0.00015)
        self._batch_size = batch_size
        self._sync_target_step = sync_target_step
        self._update_frequency = update_frequency
        self._gradient_clipping = gradient_clipping
        self._loss_fn = torch.nn.L1Loss(reduction="none")
        self._reward_clipping = reward_clipping
        
        # setting the action space
        self._actions = actions
        self._num_steps = 0
        
        # setting the replay buffer
        self._replay_start_size = replay_start_size
        self._replay_size = replay_size
        self._memory = ReplayMemory(size=replay_size, seed=seed)
        
        # setting the MDP parameters
        self._gamma = gamma
        
        # setting the exploration parameters
        self._epsilon_end = epsilon_end
        self._epsilon_diff = epsilon_start - epsilon_end
        self._epsilon_end_step = epsilon_end_step
        self._epsilon_testing = epsilon_testing
        self._epsilon = epsilon_start
        
        # setting the training status
        self._training = training
        
        self._timestep = None
        self._next_timestep = None
    
    def select_action(
        self,
        observation: acme.types.NestedArray,
    ) -> acme.types.NestedArray:
        
        """Selects an action according to the epsilon greedy policy
        """
        
        if self._exploration_rate <= torch.rand(1).item():
            tensor_observation = torch.tensor([observation], dtype=torch.float32, device=self._device)
            
            # the action is selected with probability 1-epsilon according to the policy network
            with torch.no_grad():
                q_values = self._policy_network(tensor_observation)
                return q_values.argmax().item()
        
        else:
            return torch.randint(high=self._actions, size=(1, )).item()
    
    def observe_first(
            self,
            timestep: dm_env.TimeStep,
    ):
        """Observes the first time step
        """
        
        self._next_timestep = timestep
    
    def observe(
            self,
            action: acme.types.NestedArray,
            next_timestep: dm_env.TimeStep,
    ):
        """Observes a time step and saves a transition if the agent is training
        """
        self._timestep = self._next_timestep
        self._next_timestep = next_timestep
        
        if self._training:
            # if the agent is training, save the transition
            # (previous timestep, action taken, next timestep)
            transition = (self._timestep, action, self._next_timestep)
            self._memory.push(transition)
            self._num_steps += 1  # increment the number of steps the agent took
            
            # if a logger exists we also log the current epsilon
            if self._logger is not None:
                data = {'epsilon': self._epsilon, 'replay_size': len(self._memory)}
                self._logger.write(data, self._num_steps)
    
    def update(
            self,
            wait: bool = False
    ):
        """Performs a Q-learning update
        
        Args:
            wait: not used since the algorithm is single process
        """
        
        # if the number of steps taken is larger than the initial number of samples needed
        # and the number of steps is a multiple of the update frequency an update is performed
        if (self._num_steps >= self._replay_start_size) and (self._num_steps % self._update_frequency == 0):
            # samples `batch_size` samples from memory
            transitions = self._memory.sample(self._batch_size)
        else:
            return

        device = self._device
        
        curr_transitions, actions, next_transitions = list(zip(*transitions))
        
        actions = torch.tensor(actions, device=device)
        rewards = torch.tensor([x.reward for x in next_transitions], device=device)
        curr_observations = torch.stack([torch.from_numpy(x.observation) for x in curr_transitions]).float().to(device)
        next_observations = torch.stack([torch.from_numpy(x.observation) for x in next_transitions]).float().to(device)
        done_mask = torch.tensor([x.last() for x in next_transitions], device=device, dtype=torch.bool)
        
        # perform reward clipping
        if self._reward_clipping:
            rewards = rewards.clamp(-1, 1)
        
        curr_values = self._policy_network(curr_observations)
        
        # the value of the current state is the value of the action that was taken
        curr_state_values = curr_values.gather(1, actions.unsqueeze(-1)).squeeze(-1)
        
        with torch.no_grad():
            next_values = self._target_network(next_observations)

            # the value of the next state is the maximum, we have to take the first element since max
            # returns a tuple of value, index
            next_state_values = next_values.max(1)[0]

            # the value of a terminal state is 0
            next_state_values[done_mask] = 0.0

        # computes an element-wise L1 loss against the Q-learning target;
        # squaring the values below 1 afterwards yields a Huber-style (smooth L1) loss
        loss = self._loss_fn(curr_state_values, rewards + next_state_values * self._gamma)
        
        loss[loss < 1] = loss[loss < 1] ** 2
        loss = loss.mean()
        
        # resets the gradients computed in the optimizer and does backpropagation
        self._optimizer.zero_grad()
        loss.backward()
        
        # performs gradient clipping
        if self._gradient_clipping:
            for param in self._policy_network.parameters():
                param.grad.data.clamp_(-1, 1)
        
        # updates the parameters
        self._optimizer.step()
        
        # periodically sync the target network with the policy network
        if (self._num_steps // self._update_frequency) % self._sync_target_step == 0:
            model_parameters = self._policy_network.state_dict()
            # noinspection PyTypeChecker
            self._target_network.load_state_dict(model_parameters)
        
        if self._logger is not None:
            data = {'loss': loss}
            self._logger.write(data, self._num_steps)
    
    @property
    def _exploration_rate(self):
        """Exploration rate (epsilon) which decays linearly during training
        """
        if self._training:
            time_diff = (self._epsilon_end_step - max(0, self._num_steps - self._replay_start_size))
            
            epsilon = self._epsilon_end + max(0., (self._epsilon_diff * time_diff) / self._epsilon_end_step)
        else:
            epsilon = self._epsilon_testing
        
        self._epsilon = epsilon
        return epsilon
    
    def training(self):
        """Changes the agent mode to train
        """
        self._training = True
        
    def testing(self):
        """Changes the agent mode to test
        """
        self._training = False
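Since Example #4 implements the acme.Actor interface, a driver would alternate observe_first, select_action, observe and update over dm_env time steps. A minimal episode-loop sketch under that assumption (env and agent are assumed to exist; nothing here is constructed in the example itself):

# Minimal driver sketch for the VanillaDQN actor above, assuming `env` is a
# dm_env.Environment and `agent` is an already-constructed VanillaDQN instance.
def run_episode(env, agent):
    timestep = env.reset()
    agent.observe_first(timestep)
    episode_return = 0.0
    while not timestep.last():
        action = agent.select_action(timestep.observation)
        timestep = env.step(action)
        agent.observe(action, next_timestep=timestep)
        agent.update()
        episode_return += timestep.reward or 0.0
    return episode_return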
Example #5
class Brain:
    def __init__(self, cfg, tetris):
        self.num_actions = cfg.MODEL.SIZE_ACTION
        self.gamma = cfg.SOLVER.GAMMA
        self.BATCH_SIZE = cfg.SOLVER.BATCH_SIZE

        # keep the namedtuple type on the instance so replay() can rebuild batches
        self.Transition = namedtuple('Transition',
                                     ('state', 'action', 'next_state', 'reward'))
        self.memory = ReplayMemory(cfg.SOLVER.CAPACITY, self.Transition)
        self.model = get_model(cfg)

        self.target_net = copy.deepcopy(self.model)
        self.target_net.load_state_dict(self.model.state_dict())

        self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)

        self.tetris = tetris

    def replay(self):

        if len(self.memory) < self.BATCH_SIZE:
            return

        transitions = self.memory.sample(self.BATCH_SIZE)
        batch = self.Transition(*zip(*transitions))

        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        non_final_next_states = torch.cat(
            [s for s in batch.next_state if s is not None])

        self.model.eval()

        state_action_values = self.model(state_batch).gather(1, action_batch)

        non_final_mask = torch.tensor(
            tuple(map(lambda s: s is not None, batch.next_state)),
            dtype=torch.bool)

        next_state_values = torch.zeros(self.BATCH_SIZE)

        self.target_net.eval()
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states).max(1)[0].detach()

        expected_state_action_values = (next_state_values *
                                        self.gamma) + reward_batch

        self.model.train()

        loss = F.smooth_l1_loss(state_action_values,
                                expected_state_action_values.unsqueeze(1))

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_target_model(self):
        self.target_net.load_state_dict(self.model.state_dict())

    def decide_action(self, state, mino, episode):
        epsilon = 0.41 * (1 / (episode + 1))

        if epsilon <= np.random.uniform(0, 1):
            self.model.eval()
            with torch.no_grad():
                action = self.tetris.get_masked_action(self.model(state), mino)
        else:
            action = torch.LongTensor(
                [[self.tetris.get_random_masked_action(mino)]])

        return action

    def brain_predict(self, state):
        self.model.eval()
        with torch.no_grad():
            action = self.model(state).max(1)[1].view(1, 1)
        return action
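A possible episode loop around the Brain class above. The tetris environment API (reset/step/current_mino) and the ReplayMemory.push signature are not shown in the example, so everything below is a hypothetical sketch:

# Hypothetical usage sketch for Brain; the tetris API and memory.push signature
# are assumptions, not taken from the example code above.
def train_episode(brain, episode, target_sync_interval=10):
    state = brain.tetris.reset()                     # assumed to return a state tensor
    done = False
    while not done:
        mino = brain.tetris.current_mino             # assumed attribute
        action = brain.decide_action(state, mino, episode)
        next_state, reward, done = brain.tetris.step(action)   # assumed API
        brain.memory.push(state, action, None if done else next_state, reward)
        brain.replay()                               # one optimization step per move
        state = next_state
    if episode % target_sync_interval == 0:          # sync interval chosen arbitrarily
        brain.update_target_model()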
Example #6
def main():
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env', default='Breakout-v0', help='Atari env name')
    parser.add_argument(
        '-o', '--output', default='atari-v0', help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')

    parser.add_argument('--input_shape', nargs=2, type=int, default=None,
                        help='Input shape')
    parser.add_argument('--num_frame', default=4, type=int,
                        help='Number of frames in a state')
    parser.add_argument('--discount', default=0.99, type=float,
                        help='Discount factor gamma')

    parser.add_argument('--online_train_interval', default=4, type=int,
                        help='Interval to train the online network')
    parser.add_argument('--target_reset_interval', default=10000, type=int,
                        help='Interval to reset the target network')
    parser.add_argument('--action_change_interval', default=1, type=int,
                        help='Interval to change action')
    parser.add_argument('--print_loss_interval', default=100, type=int,
                        help='Interval to print losses')

    parser.add_argument('--replay_buffer_size', default=100000, type=int,
                        help='Replay buffer size')
    parser.add_argument('--num_burn_in', default=25000, type=int,
                        help='Number of samples filled in memory before update')
    parser.add_argument('--batch_size', default=32, type=int,
                        help='How many samples in each minibatch')

    parser.add_argument('--learning_rate', default=1e-4, type=float,
                        help='Learning rate alpha')
    parser.add_argument('--explore_prob', default=0.05, type=float,
                        help='Exploration probability in epsilon-greedy')
    parser.add_argument('--decay_prob_start', default=1.0, type=float,
                        help='Starting probability in linear-decay epsilon-greedy')
    parser.add_argument('--decay_prob_end', default=0.1, type=float,
                        help='Ending probability in linear-decay epsilon-greedy')
    parser.add_argument('--decay_steps', default=1000000, type=int,
                        help='Decay steps in linear-decay epsilon-greedy')

    parser.add_argument('--num_train', default=5000000, type=int,
                        help='Number of training sampled interactions with the environment')
    parser.add_argument('--max_episode_length', default=999999, type=int,
                        help='Maximum length of an episode')
    parser.add_argument('--save_interval', default=100000, type=int,
                        help='Interval to save weights and memory')

    parser.add_argument('--model_name', default='dqn', type=str,
                        help='Model name')

    parser.add_argument('--eval_interval', default=10000, type=int,
                        help='Evaluation interval')
    parser.add_argument('--eval_episodes', default=20, type=int,
                        help='Number of episodes in evaluation')

    # argparse's type=bool treats any non-empty string (including 'False') as True,
    # so boolean switches are exposed as store_true flags instead
    parser.add_argument('--double_q', action='store_true',
                        help='Invoke double Q net')

    parser.add_argument('--do_render', action='store_true',
                        help='Do rendering or not')

    parser.add_argument('--read_weights', default=None, type=str,
                        help='Read weights from file')
    parser.add_argument('--read_memory', default=None, type=str,
                        help='Read memory from file')

    args = parser.parse_args()
    print('########## All arguments ##########:', args)
    if args.input_shape is not None:
        args.input_shape = tuple(args.input_shape)
    args.output = get_output_folder(args.output, args.env)

    env = gym.make(args.env)
    num_actions = env.action_space.n
    opt_adam = Adam(lr=args.learning_rate)

    model_online = create_model(args.num_frame, args.input_shape,
        num_actions, model_name=args.model_name)
    model_target = create_model(args.num_frame, args.input_shape,
        num_actions, model_name=args.model_name)

    q_network = {'online': model_online, 'target': model_target}

    preproc = AtariPreprocessor(args.input_shape)
    memory = ReplayMemory(args.replay_buffer_size, args.num_frame)

    policy_random = UniformRandomPolicy(num_actions)
    policy_train = LinearDecayGreedyEpsilonPolicy(args.decay_prob_start,
                                                  args.decay_prob_end,
                                                  args.decay_steps)
    policy_eval = GreedyEpsilonPolicy(args.explore_prob)
    policy = {'random': policy_random, 'train': policy_train, 'eval': policy_eval}

    agent = DQNAgent(num_actions, q_network, preproc, memory, policy, args)
    agent.compile([mean_huber_loss, null_loss], opt_adam)

    if args.read_weights is not None:
        agent.q_network['online'].load_weights(args.read_weights)
    if args.read_memory is not None:
        with open(args.read_memory, 'rb') as save_memory:
            agent.memory = pickle.load(save_memory)

    print('########## training #############')
    agent.fit(env)
Example #7
# helper method for reshaping the cartpole observation
def reshape(state):
    return np.reshape(state, [1, 4])


if __name__ == '__main__':
    tf.compat.v1.disable_eager_execution()
    max_score = 0

    n_episodes = 5000
    max_env_steps = 1000

    env = gym.make('CartPole-v0')
    agent = DQNAgent(env=env,
                     net=NN(alpha=0.001, decay=0.0001),
                     memory=ReplayMemory(size=100000))

    if max_env_steps is not None:
        env._max_episode_steps = max_env_steps

    for e in range(n_episodes):
        # reset the env
        state = reshape(env.reset())
        done = False
        score = 0
        # play until env done
        while not done:
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            # env.render()
            next_state = reshape(next_state)
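The loop in Example #7 is cut off after reshaping the next state. A typical continuation for this kind of loop might look as follows; the agent.remember and agent.replay calls are hypothetical, since the DQNAgent API is not shown in the excerpt:

            # Hypothetical continuation; remember/replay are assumed methods of
            # this DQNAgent, not shown in the excerpt above.
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            score += reward

        max_score = max(max_score, score)
        agent.replay()   # learn from a sampled minibatch at the end of each episode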
Example #8
import gym

from dqn.bots import AtariBot
from dqn.policy import DDQNPolicy
from dqn.memory import ReplayMemory

GAME = 'Breakout-v0'

# TODO: list params to tune here; eventually migrate this to a README

if __name__ == "__main__":
    policy = DDQNPolicy()
    memory = ReplayMemory()
    game = gym.make(GAME)
    game.ale.setInt(b'frame_skip', 4)
    robot = AtariBot(policy=policy, memory=memory)
    robot.train(game=game, ckpt_dir="models")
Example #9
class NeuralQLearner:

    def __init__(self, settings):

        assert isinstance(settings, dict)

        # seed
        self.rng = settings['RNG']

        # Epsilon
        self.epsilon_start = settings['EPSILON_START']
        self.epsilon_end = settings['EPSILON_END']
        self.epsilon_end_time = settings['EPSILON_END_TIME']
        self.testing_epsilon = settings['TESTING_EPSILON']
        self.epsilon_decay = (self.epsilon_start-self.epsilon_end)/float(self.epsilon_end_time)

        # Training
        self.learning_rate = settings['LEARNING_RATE']
        self.rmsprop_rho = settings['RMSPROP_RHO']
        self.rmsprop_epsilon = settings['RMSPROP_EPSILON']
        self.target_net_update = settings['TARGET_NET_UPDATE']
        self.min_reward = settings['MIN_REWARD']
        self.max_reward = settings['MAX_REWARD']

        # Q-Learning Parameters
        self.n_actions = settings['N_ACTIONS']
        self.discount_factor = settings['DISCOUNT_FACTOR']
        self.update_frequency = settings['UPDATE_FREQUENCY']
        self.learn_start = settings['LEARN_START']
        self.agent_history_length = settings['AGENT_HISTORY_LENGTH']
        self.batch_size = settings['BATCH_SIZE']

        # Preprocess
        self.resize_width = settings['RESIZE_WIDTH']
        self.resize_height = settings['RESIZE_HEIGHT']
        self.resize_dims = (self.resize_width, self.resize_height)

        self.net = DeepQNetwork(
            self.n_actions,
            self.agent_history_length,
            self.resize_height,
            self.resize_width
        )
        self.target_net = DeepQNetwork(
            self.n_actions,
            self.agent_history_length,
            self.resize_height,
            self.resize_width
        )
        self.target_net.setWeights(self.net.getWeights())

        self.memory = ReplayMemory(settings)

        self.numSteps = 0
        self.lastState = None
        self.lastAction = None
        self.lastTerminal = None

        self.compile()

    def compile(self):

        input_shape = (
            self.batch_size,
            self.agent_history_length,
            self.resize_height,
            self.resize_width
        )

        pred_input_shape = (
            1,
            self.agent_history_length,
            self.resize_height,
            self.resize_width
        )

        self.pred_input = shared(np.zeros(pred_input_shape, dtype=floatX))
        self.net_input = shared(np.zeros(input_shape, dtype=floatX))
        self.target_net_input = shared(np.zeros(input_shape, dtype=floatX))

        self.shared_actions = shared(np.zeros((self.batch_size,), dtype='int32'))
        self.shared_rewards = shared(np.zeros((self.batch_size,), dtype='int32'))
        self.shared_terminals = shared(np.zeros((self.batch_size,), dtype=floatX))

        actions = T.ivector()
        rewards = T.ivector()
        terminals = T.vector()

        targets = (rewards +
                    (T.ones_like(terminals) - terminals) *
                                self.discount_factor * T.max(self.target_net.qvalues, axis=1))

        diff = targets - self.net.qvalues[T.arange(self.batch_size), actions]

        qp = T.minimum(abs(diff), 1.0)
        lp = abs(diff) - qp
        delta = 0.5 * qp ** 2 + lp
        cost = T.sum(delta)

        optimizer = RMSprop(
            cost,
            self.net.params,
            lr=self.learning_rate,
            rho=self.rmsprop_rho,
            epsilon=self.rmsprop_epsilon
        )

        givens = {
            self.net.input: self.net_input,
            self.target_net.input: self.target_net_input,
            actions: self.shared_actions,
            rewards: self.shared_rewards,
            terminals: self.shared_terminals
        }

        self.train = function(
            inputs=[],
            outputs=cost,
            updates=optimizer.getUpdates(),
            givens=givens
        )

        self.prediction = function(
            inputs=[],
            outputs=self.net.qvalues.flatten(1),
            givens={
                self.net.input: self.pred_input
            }
        )

    def preprocess(self, rawstate):

        return cv2.resize(rawstate, self.resize_dims, interpolation=cv2.INTER_LINEAR)

    def getEpsilon(self):

        current_epsilon = self.epsilon_start - (self.numSteps * self.epsilon_decay)
        return max(self.epsilon_end, current_epsilon)

    def qLearnMinibatch(self):

        s1, a, r, t, s2 = self.memory.sampleMinibatch()
        # borrow=True so the array is not copied, which is faster
        self.net_input.set_value(s1, borrow=True)
        self.shared_actions.set_value(a, borrow=True)
        self.shared_rewards.set_value(r, borrow=True)
        self.shared_terminals.set_value(t, borrow=True)
        self.target_net_input.set_value(s2, borrow=True)
        return self.train()

    def perceive(self, rawstate, reward, terminal, testing):

        state = self.preprocess(rawstate)
        reward = max(reward, self.min_reward)
        reward = min(reward, self.max_reward)

        self.memory.storeRecentState(state, terminal)

        if((not testing) and (self.lastState is not None)):
            self.memory.storeTransition(self.lastState, self.lastAction, reward, self.lastTerminal)

        actionIndex = 0
        if(not terminal):
            actionIndex = self.eGreedy(testing)

        flag1 = (self.numSteps > self.learn_start)
        flag2 = (self.numSteps % self.update_frequency == 0)

        # short-circuit evaluation: only learn while training, after learn_start,
        # and every update_frequency steps
        if((not testing) and flag1 and flag2):
            self.qLearnMinibatch()

        if(self.numSteps % self.target_net_update == 0):
            self.target_net.setWeights(self.net.getWeights())

        self.lastState = state
        self.lastAction = actionIndex
        self.lastTerminal = terminal

        if(not testing):
            self.numSteps += 1

        return actionIndex

    def eGreedy(self, testing):

        epsilon = self.testing_epsilon if(testing) else self.getEpsilon()
        if(self.rng.uniform(0,1) < epsilon):
            return self.rng.randint(0, self.n_actions)
        else:
            return self.greedy()

    def greedy(self):

        curState = self.memory.getRecentState()
        curState = curState.reshape(1, curState.shape[0], curState.shape[1], curState.shape[2])
        self.pred_input.set_value(curState, borrow=True)
        q = self.prediction()

        maxq = q[0]
        besta = [0]
        for a in range(1, self.n_actions):
            if(q[a] > maxq):
                maxq = q[a]
                besta = [a]
            elif(q[a] == maxq):
                besta.append(a)

        r = self.rng.randint(0, len(besta))
        return besta[r]
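The symbolic graph built in compile() above encodes the standard Q-learning target and a Huber cost. For reference, a plain NumPy restatement of the same computation (array names, shapes and the 0.99 discount are illustrative):

import numpy as np

# NumPy restatement of the target and Huber cost that compile() builds in Theano.
# q_online: (batch, n_actions) Q-values of the online net for the sampled states s1
# q_target: (batch, n_actions) Q-values of the target net for the next states s2
def huber_q_cost(q_online, q_target, actions, rewards, terminals, discount=0.99):
    # y = r + (1 - terminal) * gamma * max_a' Q_target(s', a')
    targets = rewards + (1.0 - terminals) * discount * q_target.max(axis=1)
    diff = targets - q_online[np.arange(len(actions)), actions]
    quad = np.minimum(np.abs(diff), 1.0)   # quadratic region, |diff| clipped at 1
    lin = np.abs(diff) - quad              # linear remainder beyond the clip
    return np.sum(0.5 * quad ** 2 + lin)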