def test_memory():
    memory = PrioritizedMemory(10)
    memory.add(15, 1, 2, 3, 4, 5)
    memory.add(10, 4, 5, 6, 5, 2)
    indexes, transitions = zip(*memory.sample(2))
    assert indexes == (9, 10)
    assert transitions == (Transition(state=1, action=2, reward=3, next_state=4, terminal=5),
                           Transition(state=4, action=5, reward=6, next_state=5, terminal=2))

    # Example of batch creation
    assert Transition(*zip(*transitions)) == Transition(state=(1, 4), action=(2, 5), reward=(3, 6),
                                                        next_state=(4, 5), terminal=(5, 2))
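# A minimal sketch of the Transition record the test above assumes; the real
# definition lives elsewhere in the project, but the keyword fields used in the
# asserts pin down its shape.
from collections import namedtuple

Transition = namedtuple('Transition',
                        ('state', 'action', 'reward', 'next_state', 'terminal'))

# zip(*transitions) transposes a tuple of Transitions into per-field tuples, so
# Transition(*zip(*transitions)) packs a batch: one Transition whose fields are tuples.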
import gym
import numpy as np

from rl.agents.dqn import DQNAgent
from rl.policy import GreedyQPolicy
from rl.memory import PrioritizedMemory  # provided by a keras-rl fork with prioritized replay
from rl.layers import NoisyNetDense

INPUT_SHAPE = (84, 84)
WINDOW_LENGTH = 4

env = gym.make('MsPacmanDeterministic-v4')
np.random.seed(231)
env.seed(231)
nb_actions = env.action_space.n

input_shape = (WINDOW_LENGTH, INPUT_SHAPE[0], INPUT_SHAPE[1])
agent = NoisyDQN(input_shape, nb_actions)  # NoisyDQN and AtariProcessor are defined elsewhere in the project
model = agent.model

memory = PrioritizedMemory(limit=1000000, alpha=.6, start_beta=.4, end_beta=1.,
                           steps_annealed=30000000, window_length=WINDOW_LENGTH)
processor = AtariProcessor()
policy = GreedyQPolicy()

dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory,
               processor=processor, enable_double_dqn=True, enable_dueling_network=True,
               nb_steps_warmup=50000, gamma=.99, target_model_update=10000)
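# A hedged sketch of how such an agent is typically compiled and run with keras-rl;
# the optimizer settings and step counts below are illustrative, not from the original.
from keras.optimizers import Adam

dqn.compile(Adam(lr=.00025), metrics=['mae'])
dqn.fit(env, nb_steps=1750000, log_interval=10000)
dqn.test(env, nb_episodes=10, visualize=False)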
import copy

import numpy
import torch
import torch.nn.functional as F
import torch.optim as optim

# Signal, PrioritizedMemory, Transition, DQNParameters and generator_true_every
# are project-local helpers assumed to be importable from the surrounding package.


class DoubleDQN():
    """
    From Deep Reinforcement Learning with Double Q-learning
    at https://arxiv.org/abs/1509.06461
    """

    def __init__(self, DQN, parameters=DQNParameters()):
        """
        DQN: The DQN used to estimate the reward
        parameters: The parameters!
        """
        self.on_loss_computed = Signal()
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.DQN = DQN.to(self.device).train()
        self.frozen_DQN = copy.deepcopy(self.DQN).eval()
        for param in self.frozen_DQN.parameters():
            param.requires_grad = False
        self._update_frozen()
        self.memory = PrioritizedMemory(parameters.capacity)
        self.optimizer = optim.RMSprop(self.DQN.parameters(), lr=parameters.lr)
        self.parameters = parameters
        # generator_true_every(n) is assumed to yield True once every n calls
        self.it_s_replay_time = generator_true_every(1)
        self.it_s_update_frozen_time = generator_true_every(
            self.parameters.frozen_steps)
        self.it_s_action_debug_time = generator_true_every(1000)

    def _update_frozen(self):
        """
        Copy the online network's weights into the frozen target network.

        Let it go, let it go
        I am one with the wind and sky
        Let it go, let it go
        You'll never see me cry
        Here I stand and here I stay
        Let the storm rage on
        """
        self.frozen_DQN.load_state_dict(self.DQN.state_dict())

    def select_action(self, state):
        """ Return the selected action """
        with torch.no_grad():
            values = self.DQN(torch.FloatTensor([state]).to(
                self.device)).cpu().data.numpy()[0]
        if len(self.memory) > self.parameters.waiting_time:
            selected_action = numpy.argmax(values)
            if next(self.it_s_action_debug_time):
                print(selected_action, values)
        else:
            selected_action = numpy.random.randint(len(values))
        return selected_action

    def observe(self, state, action, reward, next_state, is_terminal):
        """
        Observe an experience tuple (state, action, reward, next_state, is_terminal)
        """
        if self.parameters.clipping is not None:
            # Clip the reward
            reward = numpy.clip(reward, -self.parameters.clipping,
                                self.parameters.clipping)
        # New transitions enter the memory with a fixed initial priority of 10
        self.memory.add(10, state, action, reward, next_state, is_terminal)
        if next(self.it_s_update_frozen_time):
            self._update_frozen()
        if next(self.it_s_replay_time) and len(
                self.memory) > self.parameters.waiting_time:
            self._replay()

    def train(self):
        self.DQN.train()

    def eval(self):
        self.DQN.eval()

    def save(self):
        # nn.Module has no save_state_dict(); persist the weights with torch.save
        torch.save(self.DQN.state_dict(), "model.torch")

    def _replay(self):
        """ Learn things """
        indexes, transitions = zip(
            *self.memory.sample(self.parameters.batch_size))
        # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
        # detailed explanation). This converts batch-array of Transitions
        # to Transition of batch-arrays.
        batch = Transition(*zip(*transitions))
        state_values = self.DQN(
            torch.FloatTensor(batch.state).to(self.device),
            torch.LongTensor(batch.action).to(self.device).unsqueeze(1))
        with torch.no_grad():
            next_states = torch.FloatTensor(batch.next_state).to(self.device)
            # Double Q-learning: the online network selects the next action,
            # the frozen target network evaluates it.
            best_actions = self.DQN(next_states).argmax(1, keepdim=True)
            next_values = self.frozen_DQN(next_states).gather(1, best_actions)
            expected_state_values = (
                torch.FloatTensor(batch.reward).to(self.device).unsqueeze(1)
                + self.parameters.gamma ** self.memory.n_step * next_values
                * (1 - torch.FloatTensor(batch.terminal).to(self.device).unsqueeze(1)))
        loss = F.mse_loss(state_values, expected_state_values)  # MSE Loss
        self.on_loss_computed.emit(
            loss.cpu().data.numpy())  # Emit the computed loss
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.DQN.parameters():
            if hasattr(param, "grad") and hasattr(param.grad, "data"):
                param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
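# A minimal usage sketch for DoubleDQN, assuming a gym-style environment; the
# environment and network names (env, MyDQN) are illustrative, not from the original.
agent = DoubleDQN(MyDQN())

for episode in range(1000):
    state = env.reset()
    done = False
    while not done:
        action = agent.select_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.observe(state, action, reward, next_state, done)
        state = next_state

agent.save()  # writes the online network's weights to model.torch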
model.add(Activation('relu'))
model.add(Convolution2D(64, 4, 4, subsample=(2, 2)))
model.add(Activation('relu'))
model.add(Convolution2D(64, 3, 3, subsample=(1, 1)))
model.add(Activation('relu'))
model.add(Flatten())
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = PrioritizedMemory(limit=100000, error=0.01, alfa=0.6,
                           window_length=WINDOW_LENGTH)
processor = SpectrumProcessor()

# Select a policy. We use eps-greedy action selection, which means that a random action is selected
# with probability eps. We anneal eps from 1.0 to 0.1 over the course of 1M steps. This is done so that
# the agent initially explores the environment (high eps) and then gradually sticks to what it knows
# (low eps). We also set a dedicated eps value that is used during testing. Note that we set it to 0.05
# so that the agent still performs some random actions. This ensures that the agent cannot get stuck.
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1,
                              value_test=.05, nb_steps=1000000)
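# The memory above takes error=0.01 and alfa=0.6. Assuming the usual proportional
# scheme from Schaul et al. (https://arxiv.org/abs/1511.05952), a transition's
# sampling priority would be computed roughly as below; this is an illustrative
# sketch, not that library's actual implementation.
def priority(td_error, error=0.01, alfa=0.6):
    # p_i = (|delta_i| + error) ** alfa: error keeps zero-TD transitions sampleable,
    # alfa interpolates between uniform (0) and fully greedy (1) prioritization.
    return (abs(td_error) + error) ** alfa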
def _build_dqn_agent(self, params):
    NB_ACTIONS = 7

    # ----------------------------------------------------------------------------------------------------------------
    inputShape = (params['width'], params['height'], 3)
    model = Sequential()
    model.add(Conv2D(16, (3, 3), input_shape=inputShape, padding='same', activation='relu'))
    model.add(Conv2D(32, (3, 3), padding='same', activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2), padding='same'))
    model.add(NoisyNetDense(16, activation='linear'))
    model.add(Flatten())
    model.add(NoisyNetDense(NB_ACTIONS, activation='linear'))
    model.summary()

    # ----------------------------------------------------------------------------------------------------------------
    # Memory replay
    if not params['prio_memory']:
        print("Using Sequential memory")
        memory = SequentialMemory(limit=params['mem_size'], window_length=1)
    else:
        print("Using Prioritized memory")
        params['lr'] = params['lr'] / 4
        memory = PrioritizedMemory(limit=params['mem_size'], alpha=0.6, start_beta=0.5,
                                   end_beta=1.0, steps_annealed=params['annealing'],
                                   window_length=1)

    # Epsilon-greedy policy, linearly decreasing
    if not params['noisy_layer']:
        print("Using Annealed Eps Greedy policy")
        self.policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps',
                                           value_max=params['eps'],
                                           value_min=params['eps_final'],
                                           value_test=0.0,
                                           nb_steps=params['annealing'])
    # Or greedy policy in case of noisy layers
    else:
        print("Using Q Greedy policy (with noisy layer)")
        self.policy = GreedyQPolicy()

    # Keras DQN agent
    self._dqn = DQNAgent(model=model,
                         nb_actions=NB_ACTIONS,
                         policy=self.policy,
                         memory=memory,
                         batch_size=params['batch_size'],
                         processor=WindowProcessor(),
                         enable_double_dqn=True,
                         enable_dueling_network=True,
                         nb_steps_warmup=params['train_start'],
                         gamma=params['discount'],
                         target_model_update=1000,
                         train_interval=1,
                         delta_clip=1.,
                         custom_model_objects={"NoisyNetDense": NoisyNetDense})
    self._dqn.compile(Adam(lr=params['lr']), metrics=['mae'])

    if params['load_file']:
        print("file loaded")
        self._dqn.load_weights(params['load_file'])
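# A hypothetical params dict exercising the keys _build_dqn_agent reads; the values
# are illustrative placeholders, not the project's defaults.
params = {
    'width': 84, 'height': 84,
    'prio_memory': True, 'mem_size': 100000,
    'noisy_layer': True,
    'eps': 1.0, 'eps_final': 0.1,
    'lr': 0.00025, 'annealing': 500000,
    'batch_size': 32, 'train_start': 5000,
    'discount': 0.99, 'load_file': None,
}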