def play_func(params, net, cuda, exp_queue):
    env = make(params.env_name)
    env = wrap_dqn(env)
    env.seed(common.SEED)
    device = torch.device("cuda" if cuda else "cpu")

    selector = EpsilonGreedyActionSelector(epsilon=params.epsilon_start)
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = DQNAgent(net, selector, device=device)
    exp_source = ExperienceSourceFirstLast(env, agent, gamma=params.gamma)

    for frame_idx, exp in enumerate(exp_source):
        epsilon_tracker.frame(frame_idx / BATCH_MUL)
        exp_queue.put(exp)
        for reward, steps in exp_source.pop_rewards_steps():
            exp_queue.put(EpisodeEnded(reward, steps, selector.epsilon))
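# Hedged sketch (an assumption, not part of the listing above): the consumer side
# of exp_queue in the training process. EpisodeEnded mirrors what play_func puts
# on the queue; batch_generator is a hypothetical helper name.
import collections
import random

EpisodeEnded = collections.namedtuple("EpisodeEnded", ("reward", "steps", "epsilon"))

def batch_generator(exp_queue, replay_size, initial, batch_size):
    buffer = collections.deque(maxlen=replay_size)
    while True:
        # drain everything the play process has produced so far
        while not exp_queue.empty():
            exp = exp_queue.get()
            if isinstance(exp, EpisodeEnded):
                print(f"episode done: reward={exp.reward:.2f}, "
                      f"steps={exp.steps}, eps={exp.epsilon:.2f}")
            else:
                buffer.append(exp)
        if len(buffer) >= initial:
            yield random.sample(buffer, batch_size)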
                        type=int, default=DEFAULT_N_STEPS,
                        help="steps to do on Bellman unroll")
    args = parser.parse_args()
    device = torch.device("cuda" if args.cuda else "cpu")

    env = make(params.env_name)
    env = wrap_dqn(env)
    env.seed(123)

    net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device)
    tgt_net = TargetNet(net)
    selector = EpsilonGreedyActionSelector(epsilon=params.epsilon_start)
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = DQNAgent(net, selector, device=device)
    exp_source = ExperienceSourceFirstLast(env, agent, gamma=params.gamma,
                                           steps_count=args.n)
    buffer = ExperienceReplayBuffer(exp_source, buffer_size=params.replay_size)
    optimizer = Adam(net.parameters(), lr=params.learning_rate)

    def process_batch(engine, batch):
        optimizer.zero_grad()
        # rewards in the batch are already n-step discounted sums, so only the
        # bootstrap term needs the remaining gamma**n factor
        loss = common.calc_loss_dqn(batch, net, tgt_net.target_model,
                                    gamma=params.gamma ** args.n, device=device)
        loss.backward()
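# Why gamma ** args.n above: ExperienceSourceFirstLast(steps_count=n) returns
# reward = r_t + gamma*r_{t+1} + ... + gamma**(n-1)*r_{t+n-1}, so only the
# bootstrap term still needs discounting. A hedged sketch of that target
# computation (hypothetical helper, not the common.calc_loss_dqn used above):
import torch

def n_step_targets(rewards_v, last_states_v, done_mask, tgt_net, gamma, n):
    with torch.no_grad():
        # best Q-value of the state reached after n steps, from the target net
        next_q_v = tgt_net(last_states_v).max(dim=1)[0]
        next_q_v[done_mask] = 0.0
    return rewards_v + (gamma ** n) * next_q_v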
                        help='Enter the number of steps to unroll the Bellman equation')
    args = parser.parse_args()

    print('Starting...')
    params = HYPERPARAMS['pong']
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print('Running on device {}'.format(device))

    writer = SummaryWriter(comment="-" + params['run_name'] +
                           "-%d-step noisy-net" % args.n)

    env = gym.make(params['env_name'])
    env = wrappers.wrap_dqn(env)
    # print(env.observation_space.shape, env.action_space.n)

    net = NoisyDQN(env.observation_space.shape, env.action_space.n).to(device)
    target_net = TargetNet(net)
    # exploration comes from the noisy layers, so a plain argmax selector is enough
    agent = DQNAgent(net, ArgmaxActionSelector(), device)
    experience_source = ExperienceSourceFirstLast(env, agent, params['gamma'],
                                                  steps_count=args.n)
    buffer = ExperienceReplayBuffer(experience_source, buffer_size=params['replay_size'])
    optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])

    frame_idx = 0
    with RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += 1
            buffer.populate(1)
            # get latest rewards
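# NoisyDQN is referenced above but not defined in this listing. A minimal sketch
# of the kind of noisy layer such a network typically uses (independent Gaussian
# noise, as in the NoisyNet paper); treat it as an illustration, not the actual
# NoisyDQN internals:
import torch
import torch.nn as nn
import torch.nn.functional as F

class NoisyLinear(nn.Linear):
    def __init__(self, in_features, out_features, sigma_init=0.017, bias=True):
        super().__init__(in_features, out_features, bias=bias)
        self.sigma_weight = nn.Parameter(
            torch.full((out_features, in_features), sigma_init))
        self.register_buffer("epsilon_weight",
                             torch.zeros(out_features, in_features))
        if bias:
            self.sigma_bias = nn.Parameter(torch.full((out_features,), sigma_init))
            self.register_buffer("epsilon_bias", torch.zeros(out_features))

    def forward(self, x):
        # resample noise on every forward pass; the learned sigmas scale it,
        # which is why no epsilon-greedy selector is needed for exploration
        self.epsilon_weight.normal_()
        bias = self.bias
        if bias is not None:
            self.epsilon_bias.normal_()
            bias = bias + self.sigma_bias * self.epsilon_bias
        weight = self.weight + self.sigma_weight * self.epsilon_weight
        return F.linear(x, weight, bias)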
                        default=False, action="store_true", help="Enable cuda")
    args = parser.parse_args()
    device = torch.device("cuda" if args.cuda else "cpu")

    env = make(params.env_name)
    env = wrap_dqn(env)
    env.seed(123)

    net = dqn_extra.DistributionalDQN(env.observation_space.shape,
                                      env.action_space.n).to(device)
    tgt_net = TargetNet(net)
    selector = EpsilonGreedyActionSelector(epsilon=params.epsilon_start)
    epsilon_tracker = common.EpsilonTracker(selector, params)
    # the agent needs scalar Q-values, not atom distributions, so wrap net.qvals()
    agent = DQNAgent(lambda x: net.qvals(x), selector, device=device)
    exp_source = ExperienceSourceFirstLast(env, agent, gamma=params.gamma)
    buffer = ExperienceReplayBuffer(exp_source, buffer_size=params.replay_size)
    optimizer = Adam(net.parameters(), lr=params.learning_rate)

    def process_batch(engine, batch):
        optimizer.zero_grad()
        loss = calc_loss(batch, net, tgt_net.target_model,
                         gamma=params.gamma, device=device)
        loss.backward()
        optimizer.step()
        epsilon_tracker.frame(engine.state.iteration)
        if engine.state.iteration % params.target_net_sync == 0:
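# DQNAgent above is given lambda x: net.qvals(x) because a distributional DQN
# outputs a probability distribution over return atoms per action, while the
# agent needs scalar Q-values. A hedged sketch of how such a qvals() reduction
# typically looks (atom count and value range are assumptions):
import torch

N_ATOMS, Vmin, Vmax = 51, -10, 10
support = torch.linspace(Vmin, Vmax, N_ATOMS)

def qvals_from_logits(logits):
    # logits: (batch, n_actions, N_ATOMS)
    probs = torch.softmax(logits, dim=2)
    return (probs * support).sum(dim=2)   # expected return per action = Q-value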
    last_states_q_v = net(last_states_v)
    best_last_q_v = last_states_q_v.max(dim=1)[0]
    best_last_q_v[done_masks] = 0.0
    return states_v, actions_v, best_last_q_v * gamma + rewards_v


if __name__ == "__main__":
    env = make("CartPole-v0")
    obs_size = env.observation_space.shape[0]
    n_actions = env.action_space.n

    net = Net(obs_size, HIDDEN_SIZE, n_actions)
    tgt_net = TargetNet(net)
    selector = ArgmaxActionSelector()
    selector = EpsilonGreedyActionSelector(epsilon=1, selector=selector)
    agent = DQNAgent(net, selector)
    exp_source = ExperienceSourceFirstLast(env, agent, gamma=GAMMA)
    buffer = ExperienceReplayBuffer(exp_source, buffer_size=REPLAY_SIZE)
    optimizer = Adam(net.parameters(), LR)

    step = episode = 0
    solved = False

    while True:
        step += 1
        buffer.populate(1)

        for reward, steps in exp_source.pop_rewards_steps():
            episode += 1
            print(f"{step}: episode {episode} done, {reward=:.3f}, "
                  f"epsilon={selector.epsilon:.2f}")
            solved = reward > 150
        if solved:
            print("YAY!")
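# The listing above is cut off before the optimization step. A hedged sketch of
# what the body of such a loop typically does next; the constants, train_step,
# and the unpack_batch parameter (the function whose tail opens this listing)
# are placeholder names, not taken from the code above:
import torch.nn.functional as F

BATCH_SIZE = 16        # assumed values
EPS_DECAY = 0.99
TGT_NET_SYNC = 10

def train_step(step, buffer, net, tgt_net, selector, optimizer, unpack_batch, gamma):
    if len(buffer) < 2 * BATCH_SIZE:
        return
    batch = buffer.sample(BATCH_SIZE)
    states_v, actions_v, tgt_q_v = unpack_batch(batch, tgt_net.target_model, gamma)
    optimizer.zero_grad()
    q_v = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)
    loss_v = F.mse_loss(q_v, tgt_q_v)
    loss_v.backward()
    optimizer.step()
    selector.epsilon *= EPS_DECAY          # anneal epsilon-greedy exploration
    if step % TGT_NET_SYNC == 0:
        tgt_net.sync()                     # refresh the target network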