# Here we combine the same improvements from Rainbow, but use QR instead of C51.
# Note that we are still using a DistributionalQNetwork, but this network uses
# n as the number of quantiles rather than the number of atoms.
q_func = nn.DistributionalQNetwork([64], env.action_space.n, n=75,
                                   noisy_net=True, dueling=[32])

epsilon_scheduler = dqn.annealing_schedules.Constant(0)
action_selection = dqn.algorithms.EpsilonGreedy(epsilon_scheduler)

loss = dqn.algorithms.QuantileRegressionLoss()
update_target = dqn.algorithms.HardUpdate()

alpha_scheduler = dqn.annealing_schedules.Constant(0.7)
beta_scheduler = dqn.annealing_schedules.Constant(0.5)
memory = dqn.experience_replay.Proportional(capacity=100000,
                                            alpha_scheduler=alpha_scheduler,
                                            beta_scheduler=beta_scheduler)

agent = DQNAgent(network=q_func,
                 observation_space=env.observation_space,
                 action_space=env.action_space,
                 action_selection=action_selection,
                 loss=loss,
                 update_target=update_target,
                 memory=memory,
                 n_step=3,
                 update_target_network_frequency=100)

agent.train(env, num_timesteps=num_steps, render=False)
agent.save('save/qr_dqn')
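# For reference, a minimal numpy sketch of the quantile regression (Huber) loss that
# QR-DQN minimizes. This is only an illustration of the technique, not the library's
# QuantileRegressionLoss implementation; the function name and arguments below are
# assumptions.
import numpy as np

def quantile_huber_loss(pred_quantiles, target_quantiles, kappa=1.0):
    """pred_quantiles: (n,) predicted quantile values for the taken action.
    target_quantiles: (n,) quantile values of the Bellman target distribution."""
    n = pred_quantiles.shape[0]
    taus = (np.arange(n) + 0.5) / n                            # quantile midpoints tau_i
    u = target_quantiles[None, :] - pred_quantiles[:, None]    # pairwise TD errors
    huber = np.where(np.abs(u) <= kappa,
                     0.5 * u ** 2,
                     kappa * (np.abs(u) - 0.5 * kappa))        # Huber loss on each error
    weight = np.abs(taus[:, None] - (u < 0.0))                 # asymmetric quantile weight
    # Average over target samples, sum over predicted quantiles
    return (weight * huber).mean(axis=1).sum()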
# Training loop: one episode per outer iteration, one environment step per inner
# iteration. The enclosing loops, the reset call and MAX_TIMESTEPS are assumed
# scaffolding around the original snippet, which only showed the loop body.
for episode in range(EPISODES):
    state = np.reshape(environment.reset(), [1, n_state_features])

    for t in range(MAX_TIMESTEPS):
        action = agent.get_action(state)

        # Interact with the environment and observe the new state and reward
        next_state, reward, terminated, info = environment.step(action)

        # Huge negative reward if the agent failed
        if terminated:
            reward = -100

        # Remember the agent's experience: state / action / reward / next state
        next_state = np.reshape(next_state, [1, n_state_features])
        agent.remember(state, action, reward, next_state, terminated)

        # Move to the new state
        state = next_state

        # Print statistics if the agent failed and quit the inner loop
        if terminated:
            print(
                f'Episode: {episode} of {EPISODES} '
                f'(score: {t}s, exploration rate: {agent.epsilon:.4})'
            )
            break

        # Re-train the value function approximation model once we have enough
        # examples in memory
        if len(agent.memory) >= BATCH_SIZE:
            agent.experience_replay(BATCH_SIZE)

    # Save the trained agent every once in a while
    if episode % 100 == 0:
        agent.save(f'./models/{environment_name}.h5')
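# The experience_replay step above samples a random minibatch from memory and fits the
# network toward one-step TD targets. A rough sketch of what such a method typically
# looks like for a Keras-style agent is below; the attribute names (self.model,
# self.gamma, self.memory) are assumptions, not the actual class used here.
import random
import numpy as np

def experience_replay(self, batch_size):
    minibatch = random.sample(self.memory, batch_size)
    for state, action, reward, next_state, terminated in minibatch:
        # One-step TD target: r + gamma * max_a' Q(s', a'), unless the episode ended
        target = reward
        if not terminated:
            target = reward + self.gamma * np.amax(self.model.predict(next_state)[0])
        # Only the taken action's Q-value is moved toward the target
        target_q = self.model.predict(state)
        target_q[0][action] = target
        self.model.fit(state, target_q, epochs=1, verbose=0)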
# Rainbow uses a DistributionalQNetwork where n is the number of atoms (C51).
# (The constructor's leading arguments were cut off; they are reconstructed here
# to mirror the QR-DQN example above, with hidden sizes assumed to be [64].)
q_func = nn.DistributionalQNetwork([64], env.action_space.n, n=51,
                                   noisy_net=True, dueling=[32])

# Action selection in Rainbow is done using noisy nets with no epsilon
epsilon_scheduler = dqn.annealing_schedules.Constant(0)
action_selection = dqn.algorithms.EpsilonGreedy(epsilon_scheduler)

loss = dqn.algorithms.CategoricalAlgorithm(double_q=True)
update_target = dqn.algorithms.HardUpdate()

alpha_scheduler = dqn.annealing_schedules.Constant(0.7)
beta_scheduler = dqn.annealing_schedules.Constant(0.5)
memory = dqn.experience_replay.Proportional(capacity=100000,
                                            alpha_scheduler=alpha_scheduler,
                                            beta_scheduler=beta_scheduler)

agent = DQNAgent(network=q_func,
                 observation_space=env.observation_space,
                 action_space=env.action_space,
                 action_selection=action_selection,
                 loss=loss,
                 update_target=update_target,
                 memory=memory,
                 n_step=3,
                 update_target_network_frequency=200)

agent.train(env, num_timesteps=num_steps, render=False)
agent.save('save_test/rainbow')
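# For reference, a minimal numpy sketch of the categorical (C51) projection step that a
# distributional loss such as CategoricalAlgorithm relies on: the Bellman-updated return
# distribution is projected back onto the fixed support of atoms. This is only an
# illustration; the function name, v_min/v_max values and array layout are assumptions,
# not the library's implementation.
import numpy as np

def project_distribution(next_probs, rewards, dones, gamma,
                         v_min=-10.0, v_max=10.0, n_atoms=51):
    """next_probs: (batch, n_atoms) probabilities of the greedy next action.
    rewards, dones: (batch,) arrays; dones is 1.0 where the episode terminated."""
    batch = next_probs.shape[0]
    support = np.linspace(v_min, v_max, n_atoms)
    delta_z = (v_max - v_min) / (n_atoms - 1)
    # Bellman-updated atom locations, clipped back onto the support
    tz = np.clip(rewards[:, None] + gamma * (1.0 - dones[:, None]) * support[None, :],
                 v_min, v_max)
    b = (tz - v_min) / delta_z                    # fractional atom indices
    lower = np.floor(b).astype(int)
    upper = np.ceil(b).astype(int)
    projected = np.zeros((batch, n_atoms))
    for i in range(batch):
        for j in range(n_atoms):
            if lower[i, j] == upper[i, j]:        # lands exactly on an atom
                projected[i, lower[i, j]] += next_probs[i, j]
            else:                                 # split mass between neighbouring atoms
                projected[i, lower[i, j]] += next_probs[i, j] * (upper[i, j] - b[i, j])
                projected[i, upper[i, j]] += next_probs[i, j] * (b[i, j] - lower[i, j])
    return projected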