def play_func(params, net, cuda, exp_queue):
    # Create and wrap the environment inside the play process
    env = make(params.env_name)
    env = wrap_dqn(env)
    env.seed(common.SEED)
    device = torch.device("cuda" if cuda else "cpu")

    # Epsilon-greedy agent acting on the network shared with the training process
    selector = EpsilonGreedyActionSelector(epsilon=params.epsilon_start)
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = DQNAgent(net, selector, device=device)
    exp_source = ExperienceSourceFirstLast(env, agent, gamma=params.gamma)

    # Push every transition into the queue; when an episode ends, also push a summary
    for frame_idx, exp in enumerate(exp_source):
        epsilon_tracker.frame(frame_idx / BATCH_MUL)
        exp_queue.put(exp)
        for reward, steps in exp_source.pop_rewards_steps():
            exp_queue.put(EpisodeEnded(reward, steps, selector.epsilon))
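The EpisodeEnded object placed on the queue is not defined in this snippet; a minimal sketch of what it could look like, assuming it is simply an immutable message carrying the episode reward, step count, and the epsilon in effect when the episode finished (the field names are an assumption):

import collections

# Hypothetical message type pushed through exp_queue when an episode finishes,
# so the training process can log rewards, steps, and the current epsilon.
EpisodeEnded = collections.namedtuple(
    "EpisodeEnded", field_names=("reward", "steps", "epsilon"))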
parser.add_argument("-n", type=int, default=DEFAULT_N_STEPS,
                    help="Steps to unroll the Bellman equation")
args = parser.parse_args()
device = torch.device("cuda" if args.cuda else "cpu")

env = make(params.env_name)
env = wrap_dqn(env)
env.seed(123)

net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device)
tgt_net = TargetNet(net)

selector = EpsilonGreedyActionSelector(epsilon=params.epsilon_start)
epsilon_tracker = common.EpsilonTracker(selector, params)
agent = DQNAgent(net, selector, device=device)
# The experience source unrolls args.n steps per stored transition
exp_source = ExperienceSourceFirstLast(
    env, agent, gamma=params.gamma, steps_count=args.n)
buffer = ExperienceReplayBuffer(exp_source, buffer_size=params.replay_size)
optimizer = Adam(net.parameters(), lr=params.learning_rate)

def process_batch(engine, batch):
    optimizer.zero_grad()
    # The bootstrap term is discounted by gamma**n, because the intermediate
    # rewards are already accumulated by ExperienceSourceFirstLast
    loss = common.calc_loss_dqn(
        batch, net, tgt_net.target_model,
        gamma=params.gamma ** args.n, device=device)
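The snippet stops in the middle of process_batch. A hedged sketch of how such a batch-processing function typically continues in this style of training loop, assuming it is driven by an ignite Engine; the names params.target_net_sync and engine.state.iteration are assumptions, not taken from this snippet:

    loss.backward()
    optimizer.step()
    epsilon_tracker.frame(engine.state.iteration)
    # Periodically copy the online network into the target network
    if engine.state.iteration % params.target_net_sync == 0:
        tgt_net.sync()
    # Values returned here become available as engine.state.output for metrics
    return {
        "loss": loss.item(),
        "epsilon": selector.epsilon,
    }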