def main(): NAME = "01_baseline" random.seed(common.SEED) torch.manual_seed(common.SEED) params = common.HYPERPARAMS["pong"] parser = argparse.ArgumentParser() parser.add_argument("--cuda", default=True, action="store_true", help="Enable cuda") args = parser.parse_args() device = torch.device("cuda" if args.cuda else "cpu") env = gym.make(params.env_name) env = ptan.common.wrappers.wrap_dqn(env) env.seed(common.SEED) net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device) tgt_net = ptan.agent.TargetNet(net) selector = ptan.actions.EpsilonGreedyActionSelector( epsilon=params.epsilon_start) epsilon_tracker = common.EpsilonTracker(selector, params) agent = ptan.agent.DQNAgent(net, selector, device=device) exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params.gamma) buffer = ptan.experience.ExperienceReplayBuffer( exp_source, buffer_size=params.replay_size) optimizer = optim.Adam(net.parameters(), lr=params.learning_rate) def process_batch(engine_, batch): optimizer.zero_grad() loss_v = common.calc_loss_dqn(batch, net, tgt_net.target_model, gamma=params.gamma, device=device) loss_v.backward() optimizer.step() epsilon_tracker.frame(engine_.state.iteration) if engine_.state.iteration % params.target_net_sync == 0: tgt_net.sync() return { "loss": loss_v.item(), "epsilon": selector.epsilon, } engine = Engine(process_batch) common.setup_ignite(engine, params, exp_source, NAME) engine.run( common.batch_generator(buffer, params.replay_initial, params.batch_size))
def play_func(params, net, cuda, exp_queue):
    env = gym.make(params.env_name)
    env = ptan.common.wrappers.wrap_dqn(env)
    env.seed(common.SEED)
    device = torch.device("cuda" if cuda else "cpu")

    selector = ptan.actions.EpsilonGreedyActionSelector(
        epsilon=params.epsilon_start)
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, device=device)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        env, agent, gamma=params.gamma)

    for frame_idx, exp in enumerate(exp_source):
        epsilon_tracker.frame(frame_idx / BATCH_MUL)
        exp_queue.put(exp)
        for reward, steps in exp_source.pop_rewards_steps():
            exp_queue.put(EpisodeEnded(reward, steps, selector.epsilon))
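play_func is meant to run in a child process, streaming both raw transitions and end-of-episode markers through the same queue, so the training process has to tell the two apart when draining it. Below is a minimal sketch of the consumer side, assuming EpisodeEnded is a plain namedtuple and the training-side buffer was created without an experience source; the batches_from_queue generator is illustrative, not the repository's exact code.

import collections

EpisodeEnded = collections.namedtuple(
    "EpisodeEnded", field_names=("reward", "steps", "epsilon"))

def batches_from_queue(exp_queue, buffer, initial, batch_size):
    # Hypothetical consumer generator running in the training process
    while True:
        # Block until the play process delivers at least one item,
        # then drain whatever else is already queued
        items = [exp_queue.get()]
        while exp_queue.qsize() > 0:
            items.append(exp_queue.get())
        for item in items:
            if isinstance(item, EpisodeEnded):
                pass  # episode bookkeeping (reward, steps, epsilon)
            else:
                buffer._add(item)  # _add: ptan-internal, needed because
                                   # the buffer has exp_source=None
        if len(buffer) >= initial:
            yield buffer.sample(batch_size)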
        # tail of the count-based reward wrapper construction; the call's
        # opening line is not part of this excerpt
        reward_scale=params.counts_reward_scale, hash_function=counts_hash)
    env.seed(common.SEED)

    if args.params.startswith("egreedy") or args.params == "counts":
        net = dqn_extra.MountainCarBaseDQN(
            env.observation_space.shape[0], env.action_space.n)
    elif args.params == "noisynet":
        net = dqn_extra.MountainCarNoisyNetDQN(
            env.observation_space.shape[0], env.action_space.n)
    tgt_net = ptan.agent.TargetNet(net)
    print(net)

    if args.params.startswith("egreedy"):
        selector = ptan.actions.EpsilonGreedyActionSelector(
            epsilon=params.epsilon_start)
        epsilon_tracker = common.EpsilonTracker(selector, params)
        training_enabled = not params.eps_decay_trigger
        epsilon_tracker_frame = 0
    else:
        selector = ptan.actions.ArgmaxActionSelector()
        training_enabled = True

    agent = ptan.agent.DQNAgent(
        net, selector, preprocessor=ptan.agent.float32_preprocessor)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        env, agent, gamma=params.gamma, steps_count=N_STEPS)
    buffer = ptan.experience.ExperienceReplayBuffer(
        exp_source, buffer_size=params.replay_size)
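The truncated call at the top of this listing attaches a count-based exploration bonus to the environment's reward: each observation is hashed into a discrete cell, visits are counted, and reward_scale / sqrt(N(s)) is added to the native reward, so rarely visited states pay out more. A minimal sketch of such a wrapper, assuming the classic Gym step API; the class name and structure are illustrative, not necessarily the book's exact implementation.

import collections
import gym

class PseudoCountRewardWrapper(gym.Wrapper):
    """Adds reward_scale / sqrt(N(hash(obs))) to the environment reward."""
    def __init__(self, env, hash_function, reward_scale=1.0):
        super().__init__(env)
        self._hash = hash_function
        self._scale = reward_scale
        self._counts = collections.Counter()

    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        h = self._hash(obs)
        self._counts[h] += 1
        # Bonus decays as the state's cell gets visited more often
        bonus = self._scale / (self._counts[h] ** 0.5)
        return obs, reward + bonus, done, info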
def main(): NAME = "03_double" STATES_TO_EVALUATE = 1000 EVAL_EVERY_FRAME = 100 random.seed(common.SEED) torch.manual_seed(common.SEED) params = common.HYPERPARAMS["pong"] parser = argparse.ArgumentParser() parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda") parser.add_argument("--double", default=False, action="store_true", help="Enable double dqn") args = parser.parse_args() device = torch.device("cuda" if args.cuda else "cpu") env = gym.make(params.env_name) env = ptan.common.wrappers.wrap_dqn(env) env.seed(common.SEED) net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device) tgt_net = ptan.agent.TargetNet(net) selector = ptan.actions.EpsilonGreedyActionSelector( epsilon=params.epsilon_start) epsilon_tracker = common.EpsilonTracker(selector, params) agent = ptan.agent.DQNAgent(net, selector, device=device) exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params.gamma) buffer = ptan.experience.ExperienceReplayBuffer( exp_source, buffer_size=params.replay_size) optimizer = optim.Adam(net.parameters(), lr=params.learning_rate) def process_batch(engine_, batch): optimizer.zero_grad() loss_v = calc_loss_double_dqn(batch, net, tgt_net.target_model, gamma=params.gamma, device=device, double=args.double) loss_v.backward() optimizer.step() epsilon_tracker.frame(engine_.state.iteration) if engine_.state.iteration % params.target_net_sync == 0: tgt_net.sync() if engine_.state.iteration % EVAL_EVERY_FRAME == 0: eval_states = getattr(engine_.state, "eval_states", None) if eval_states is None: eval_states = buffer.sample(STATES_TO_EVALUATE) eval_states = [ np.array(transition.state, copy=False) for transition in eval_states ] eval_states = np.array(eval_states, copy=False) engine_.state.eval_states = eval_states engine_.state.metrics["values"] = common.calc_values_of_states( eval_states, net, device) return { "loss": loss_v.item(), "epsilon": selector.epsilon, } engine = Engine(process_batch) common.setup_ignite(engine, params, exp_source, f"{NAME}={args.double}", extra_metrics=("values", )) engine.run( common.batch_generator(buffer, params.replay_initial, params.batch_size))
    b_env = data.MAgentEnv(m_env, b_handle, reset_env_func=reset_env,
                           is_slave=False, steps_limit=MAX_EPISODE)

    obs = data.MAgentEnv.handle_obs_space(m_env, a_handle)
    net = model.DQNModel(obs.spaces[0].shape, obs.spaces[1].shape,
                         m_env.get_action_space(a_handle)[0]).to(device)
    tgt_net = ptan.agent.TargetNet(net)
    print(net)

    action_selector = ptan.actions.EpsilonGreedyActionSelector(
        epsilon=PARAMS.epsilon_start)
    epsilon_tracker = common.EpsilonTracker(action_selector, PARAMS)
    preproc = model.MAgentPreprocessor(device)
    agent = ptan.agent.DQNAgent(net, action_selector, device,
                                preprocessor=preproc)
    a_exp_source = ptan.experience.ExperienceSourceFirstLast(
        a_env, agent, PARAMS.gamma, vectorized=True)
    b_exp_source = ptan.experience.ExperienceSourceFirstLast(
        b_env, agent, PARAMS.gamma, vectorized=True)
    buffer = ptan.experience.ExperienceReplayBuffer(None, PARAMS.replay_size)
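Because the buffer is created without an experience source, nothing fills it automatically; the training loop has to pump transitions from both groups' sources itself. One straightforward option is to interleave the two sources in lock-step, sketched below. The zip-based generator and the use of ptan's internal _add are illustrative assumptions, not necessarily the exact generator used here.

def two_group_batch_generator(buffer, a_exp_source, b_exp_source,
                              initial, batch_size):
    # Interleave transitions from both agent groups into the shared buffer
    for a_exp, b_exp in zip(a_exp_source, b_exp_source):
        buffer._add(a_exp)  # _add is ptan-internal; used because the buffer
        buffer._add(b_exp)  # was constructed with exp_source=None
        if len(buffer) >= initial:
            yield buffer.sample(batch_size)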