        with open('q_vals.pickle', 'rb') as handle:
            self.Q_expert = pickle.load(handle)
        print(self.Q_expert)


if __name__ == '__main__':
    # DiscretisedEnv
    env = DiscretisedEnv(gym.make('CartPole-v0'))

    # hyperparameters
    n_episodes = 1000
    goal_duration = 150
    decay_steps = 5000
    all_rewards = list()
    durations = collections.deque(maxlen=100)

    Epsilon = AnnealingSchedule(start=1.0, end=0.01, decay_steps=decay_steps)
    Alpha = AnnealingSchedule(start=1.0, end=0.01, decay_steps=decay_steps)

    agent = Q_Agent(env)
    agent.load_Q()

    global_timestep = tf.train.get_or_create_global_step()

    for episode in range(n_episodes):
        current_state = env.reset()
        done = False
        duration = 0

        # one episode of q learning
        while not done:
            # env.render()
            duration += 1
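# --- Sketch (not part of the original script) ---------------------------------
# The episode loop above is cut off before the action selection and update.
# Below is a minimal, self-contained illustration of the per-step epsilon-greedy
# choice and tabular Q-learning update such a loop usually performs; the Q-table
# layout and names are assumptions, not the repo's code.
import numpy as np

def q_learning_step(Q, env, state, epsilon, alpha, gamma=0.99):
    """One epsilon-greedy step followed by a tabular Q-learning update."""
    if np.random.rand() < epsilon:
        action = env.action_space.sample()      # explore
    else:
        action = int(np.argmax(Q[state]))       # exploit the current estimate

    next_state, reward, done, _ = env.step(action)

    # bootstrap from the greedy value of the next state, unless the episode ended
    td_target = reward + gamma * np.max(Q[next_state]) * (1.0 - float(done))
    Q[state][action] += alpha * (td_target - Q[state][action])
    return next_state, reward, done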
                    default=100, type=int, help="game env type")
args = parser.parse_args()

if args.mode == "CartPole":
    env = MyWrapper(gym.make("CartPole-v0"))
elif args.mode == "Atari":
    env = wrap_deepmind(make_atari("PongNoFrameskip-v4"))

params = Parameters(algo="DQfD", mode=args.mode)
params.num_episodes = args.num_episodes

replay_buffer = PrioritizedReplayBuffer(params.memory_size, alpha=params.prioritized_replay_alpha)
Beta = AnnealingSchedule(start=params.prioritized_replay_beta_start,
                         end=params.prioritized_replay_beta_end,
                         decay_steps=params.decay_steps)

agent = DQfD(args.mode, Model, Model, env.action_space.n, params, logdirs.model_DQN)

if params.policy_fn == "Eps":
    Epsilon = AnnealingSchedule(start=params.epsilon_start,
                                end=params.epsilon_end,
                                decay_steps=params.decay_steps)
    policy = EpsilonGreedyPolicy_eager(Epsilon_fn=Epsilon)
elif params.policy_fn == "Boltzmann":
    policy = BoltzmannQPolicy_eager()

reward_buffer = deque(maxlen=params.reward_buffer_ep)
summary_writer = tf.contrib.summary.create_file_writer(logdirs.log_DQfD)

expert = DQN(args.mode, Model_CartPole_DQN, Model_CartPole_DQN,
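# --- Sketch (not part of the original script) ---------------------------------
# The Beta schedule above is normally consumed when sampling from the
# prioritized buffer inside the training loop. Assuming the buffer follows the
# OpenAI-baselines-style interface (sample(batch_size, beta) /
# update_priorities(indices, priorities)), one update would look roughly like:
#
#     states, actions, rewards, next_states, dones, weights, indices = \
#         replay_buffer.sample(params.batch_size, Beta.get_value())
#     loss, td_errors = agent.update(states, actions, rewards, next_states, dones, weights)
#     replay_buffer.update_priorities(indices, np.abs(td_errors) + small_eps)
#
# `agent.update`, its return values, and `small_eps` (a small constant keeping
# priorities strictly positive) are placeholders; only the buffer calls are
# assumed to match the baselines API.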
parser.add_argument("--google_colab", default=False, type=bool, help="if you are executing this on GoogleColab") params = parser.parse_args() params.goal = 195 params.test_episodes = 10 params.prioritized_replay_alpha = 0.6 params.prioritized_replay_beta_start = 0.4 params.prioritized_replay_beta_end = 1.0 params.prioritized_replay_noise = 1e-6 replay_buffer = PrioritizedReplayBuffer(params.memory_size, alpha=params.prioritized_replay_alpha) Beta = AnnealingSchedule(start=params.prioritized_replay_beta_start, end=params.prioritized_replay_beta_end, decay_steps=params.decay_steps) Epsilon = AnnealingSchedule(start=params.epsilon_start, end=params.epsilon_end, decay_steps=params.decay_steps) policy = EpsilonGreedyPolicy_eager(Epsilon_fn=Epsilon) Epsilon = AnnealingSchedule(start=params.epsilon_start, end=params.epsilon_end, decay_steps=params.decay_steps) reward_buffer = deque(maxlen=params.reward_buffer_ep) anneal_lr = AnnealingSchedule(start=0.0025, end=0.00025, decay_steps=params.decay_steps, decay_type="linear") optimizer = tf.train.RMSPropOptimizer(anneal_lr.get_value(), 0.99, 0.0, 1e-6)
if params.google_colab:
    # mount your Google Drive on Google Colab
    from google.colab import drive
    drive.mount("/content/gdrive")
    params.log_dir = "/content/gdrive/My Drive/logs/logs/DQN/{}".format(params.env_name)
    params.model_dir = "/content/gdrive/My Drive/logs/models/DQN/{}".format(params.env_name)
    # create the log/model directories on the mounted drive (no error if they already exist)
    os.makedirs(params.log_dir, exist_ok=True)
    os.makedirs(params.model_dir, exist_ok=True)
    assert os.path.isdir(params.log_dir), "Failed to create a directory under My Drive, please check it"
    assert os.path.isdir(params.model_dir), "Failed to create a directory under My Drive, please check it"
    agent = DQN(params, env.action_space.n)
else:
    agent = DQN(params, env.action_space.n)

Epsilon = AnnealingSchedule(start=params.epsilon_start, end=params.epsilon_end, decay_steps=params.decay_steps)
policy = EpsilonGreedyPolicy(Epsilon_fn=Epsilon)
replay_buffer = ReplayBuffer(params.memory_size)
reward_buffer = deque(maxlen=params.reward_buffer_ep)
summary_writer = tf.contrib.summary.create_file_writer(params.log_dir)

train_DQN(agent, env, policy, replay_buffer, reward_buffer, params, summary_writer)
        # apply processed gradients to the network
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_weights))
        return loss, batch_loss


if __name__ == '__main__':
    env = gym.make('MountainCarContinuous-v0')

    # hyperparameters
    all_rewards = list()
    params = Parameters(algo="DQN", mode="CartPole")
    Epsilon = AnnealingSchedule(start=params.epsilon_start, end=params.epsilon_end, decay_steps=params.decay_steps)
    Alpha = AnnealingSchedule(start=params.epsilon_start, end=params.epsilon_end, decay_steps=params.decay_steps)
    agent = Continuous_Q_Agent(env, params)

    global_step = 0
    for episode in range(params.num_episodes):
        state = env.reset()
        episode_loss = 0
        episode_reward = 0

        for t in itertools.count():
            # env.render()
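# --- Sketch (not part of the original script) ---------------------------------
# For context on the apply_gradients fragment at the top of this file: in eager
# mode those gradients usually come from a tf.GradientTape recorded just before
# that call. A generic version of such an update step is sketched below; the
# loss definition and argument names are assumptions, not the repo's code.
def update_step(model, optimizer, states, targets):
    """One gradient step minimising the squared error between Q estimates and targets."""
    with tf.GradientTape() as tape:
        q_values = model(states)                          # forward pass
        batch_loss = tf.squared_difference(targets, q_values)
        loss = tf.reduce_mean(batch_loss)                 # scalar loss for the optimizer
    grads = tape.gradient(loss, model.trainable_weights)
    optimizer.apply_gradients(zip(grads, model.trainable_weights))
    return loss, batch_loss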
from tf_rl.common.utils import AnnealingSchedule
from tf_rl.common.params import Parameters
from tf_rl.common.policy import EpsilonGreedyPolicy

params = Parameters("CartPole")
Epsilon = AnnealingSchedule(start=params.epsilon_start, end=params.epsilon_end, decay_steps=params.decay_steps)
policy = EpsilonGreedyPolicy(Epsilon_fn=Epsilon)

num_episodes = 80

for ep in range(num_episodes):
    print(Epsilon.get_value(ep))
    policy.index_episode = ep
    print(policy.current_epsilon())
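# --- Sketch (not part of the library) ------------------------------------------
# The test above only exercises the public surface of AnnealingSchedule.
# As a rough mental model, a linearly decaying schedule could look like the
# class below; the real tf_rl implementation may differ (e.g. exponential decay,
# internal timestep tracking).
class LinearAnnealingSchedule:
    """Linearly anneal a value from `start` to `end` over `decay_steps` steps."""

    def __init__(self, start, end, decay_steps):
        self.start = start
        self.end = end
        self.decay_steps = decay_steps

    def get_value(self, timestep):
        # clip the interpolation fraction to [0, 1] so the value stays at `end`
        # once the schedule has finished decaying
        fraction = min(float(timestep) / self.decay_steps, 1.0)
        return self.start + fraction * (self.end - self.start)


# e.g. LinearAnnealingSchedule(1.0, 0.01, 5000).get_value(2500) -> ~0.505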