# AGENT
selector = actions.ArgmaxActionSelector()
agent = agents.DQNAgent(net, selector, device=device)

# RUNNER
exp_source = runner.RunnerSourceFirstLast(
    env, agent, gamma=params["gamma"])  # increase the number of steps for the runner
buffer = ExperienceReplayBuffer(
    exp_source, buffer_size=params["replay_size"])
optimizer = optim.Adam(net.parameters(), lr=params["learning_rate"])
frame_idx = 0

# TRAIN
with logger.RewardTracker(writer, params["stop_reward"]) as reward_tracker:
    while True:
        frame_idx += 1
        buffer.populate(1)

        new_rewards = exp_source.pop_total_rewards()
        if new_rewards:
            if reward_tracker.reward(new_rewards[0], frame_idx):
                break
        if len(buffer) < params["replay_initial"]:
            continue

        # learning step
        optimizer.zero_grad()
        batch = buffer.sample(params["batch_size"])
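The listing stops right after a batch is sampled; the loss computation and optimizer step are not shown. Below is a minimal sketch of a standard one-step DQN loss, assuming the experience entries expose state, action, reward and last_state fields (as ptan-style first/last sources do) and that a separate target network tgt_net is available. The actual loss function used by this code may differ.

import numpy as np
import torch
import torch.nn as nn

def calc_dqn_loss(batch, net, tgt_net, gamma, device="cpu"):
    # Unpack the sampled transitions; last_state is None for terminal steps.
    states = torch.tensor(np.array([e.state for e in batch]),
                          dtype=torch.float32).to(device)
    actions = torch.tensor([e.action for e in batch]).to(device)
    rewards = torch.tensor([e.reward for e in batch],
                           dtype=torch.float32).to(device)
    done_mask = torch.tensor([e.last_state is None for e in batch],
                             dtype=torch.bool).to(device)
    last_states = torch.tensor(np.array([
        e.last_state if e.last_state is not None else e.state for e in batch
    ]), dtype=torch.float32).to(device)

    # Q(s, a) for the actions actually taken
    q_values = net(states).gather(1, actions.unsqueeze(-1)).squeeze(-1)

    # Bellman target: r + gamma * max_a' Q_target(s', a'), zeroed for terminal states
    with torch.no_grad():
        next_q = tgt_net(last_states).max(1)[0]
        next_q[done_mask] = 0.0
        target_q = rewards + gamma * next_q

    return nn.MSELoss()(q_values, target_q)

With such a helper, the learning step would finish with a backward pass on the returned loss, optimizer.step(), and a periodic sync of the target network.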
# keep the network's weights in shared memory so child processes use the same parameters
net.share_memory()
optimizer = optim.Adam(net.parameters(), lr=params["learning_rate"], eps=1e-3)

# initialise processes
train_queue, data_proc_list = init_procs(envs, params)

batch = []
step_idx = 0

# TRAINING
try:
    with logger.RewardTracker(net, writer, stop_reward=params["stop_reward"], tag="a3c") as tracker:
        while True:
            train_entry = train_queue.get()
            # child processes report finished episodes as TotalReward entries
            if isinstance(train_entry, TotalReward):
                if tracker.reward(train_entry.reward, step_idx):
                    break
                continue

            step_idx += 1
            batch.append(train_entry)
            if len(batch) < params["batch_size"]:
                continue

            loss_policy, loss_v = calc_a2c_loss(batch, net, params)
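The excerpt ends right after the loss is computed; the step that normally follows (combine the losses, backpropagate, clip gradients, advance the optimizer and drop the batch) is not shown. One plausible continuation is sketched below; the clip_grad value is an assumption, since params is not listed in this excerpt.

import torch.nn.utils as nn_utils

optimizer.zero_grad()
loss = loss_policy + loss_v          # combined actor-critic loss
loss.backward()
# "clip_grad" is assumed to be present in params; 0.1 is only a placeholder default
nn_utils.clip_grad_norm_(net.parameters(), params.get("clip_grad", 0.1))
optimizer.step()
batch.clear()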
tgt_crt_net = ptan.agent.TargetNet(crt_net)

writer = SummaryWriter(comment="-ddpg_" + args.name)
agent = agents.AgentDDPG(act_net, device=device)
exp_source = runner.RunnerSourceFirstLast(
    env, agent, gamma=GAMMA, steps_count=1)
buffer = memory.ExperienceReplayBuffer(
    exp_source, buffer_size=REPLAY_SIZE)
act_opt = optim.Adam(act_net.parameters(), lr=LEARNING_RATE)
crt_opt = optim.Adam(crt_net.parameters(), lr=LEARNING_RATE)

frame_idx = 0
best_reward = None
with logger.RewardTracker(act_net, writer, 200) as tracker:
    with ptan.common.utils.TBMeanTracker(writer, batch_size=10) as tb_tracker:
        while True:
            frame_idx += 1
            buffer.populate(1)
            # report finished episodes to TensorBoard and the reward tracker
            rewards_steps = exp_source.pop_rewards_steps()
            if rewards_steps:
                rewards, steps = zip(*rewards_steps)
                tb_tracker.track("episode_steps", steps[0], frame_idx)
                tracker.reward(rewards[0], frame_idx)

            if len(buffer) < REPLAY_INITIAL:
                continue

            batch = buffer.sample(BATCH_SIZE)
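The sampled batch then feeds a critic update and an actor update, which the excerpt stops before. The sketch below shows a standard DDPG learning step under a few assumptions: a matching tgt_act_net target exists for the actor alongside tgt_crt_net, the critic takes (state, action) pairs, and the experience entries expose state, action, reward and last_state. The helper name is hypothetical.

import numpy as np
import torch
import torch.nn.functional as F

# Hypothetical helper illustrating one DDPG update on a sampled batch.
def ddpg_learning_step(batch, act_net, crt_net, tgt_act_net, tgt_crt_net,
                       act_opt, crt_opt, gamma, device="cpu"):
    states = torch.tensor(np.array([e.state for e in batch]),
                          dtype=torch.float32).to(device)
    actions = torch.tensor(np.array([e.action for e in batch]),
                           dtype=torch.float32).to(device)
    rewards = torch.tensor([e.reward for e in batch],
                           dtype=torch.float32).to(device)
    done_mask = torch.tensor([e.last_state is None for e in batch],
                             dtype=torch.bool).to(device)
    last_states = torch.tensor(np.array([
        e.last_state if e.last_state is not None else e.state for e in batch
    ]), dtype=torch.float32).to(device)

    # Critic update: regress Q(s, a) towards r + gamma * Q_target(s', mu_target(s'))
    crt_opt.zero_grad()
    q_v = crt_net(states, actions).squeeze(-1)
    with torch.no_grad():
        last_act = tgt_act_net.target_model(last_states)
        q_last = tgt_crt_net.target_model(last_states, last_act).squeeze(-1)
        q_last[done_mask] = 0.0
        q_ref = rewards + gamma * q_last
    critic_loss = F.mse_loss(q_v, q_ref)
    critic_loss.backward()
    crt_opt.step()

    # Actor update: push the policy towards actions the critic scores higher
    act_opt.zero_grad()
    cur_actions = act_net(states)
    actor_loss = -crt_net(states, cur_actions).mean()
    actor_loss.backward()
    act_opt.step()

    # Soft-sync the target networks (ptan's TargetNet exposes alpha_sync)
    tgt_act_net.alpha_sync(alpha=1 - 1e-3)
    tgt_crt_net.alpha_sync(alpha=1 - 1e-3)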
apply_softmax=True, device=device)

# RUNNER
exp_source = runner.RunnerSourceFirstLast(
    envs, agent, gamma=params["gamma"], steps_count=params["step_count"])
optimizer = optim.Adam(net.parameters(), lr=params["learning_rate"], eps=1e-3)

batch = []

# TRAINING
with logger.RewardTracker(net, writer, stop_reward=195, tag="a2c") as tracker:
    for step_idx, exp in enumerate(exp_source):
        batch.append(exp)

        # handle new rewards
        new_rewards = exp_source.pop_total_rewards()
        if new_rewards:
            if tracker.reward(new_rewards[0], step_idx):
                break

        if len(batch) < params["batch_size"]:
            continue

        loss_policy, loss_v = calc_a2c_loss(batch, net, params)
        batch.clear()
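The calc_a2c_loss function called by both this loop and the A3C one is not shown in these listings. The sketch below illustrates what such an advantage actor-critic loss might compute, assuming the network returns a (policy_logits, value) pair, params holds "gamma" and "step_count", and each batch entry carries state, action, reward and last_state, with reward already holding the discounted n-step return. The entropy bonus and other details may differ from the actual implementation.

import numpy as np
import torch
import torch.nn.functional as F

def calc_a2c_loss_sketch(batch, net, params, device="cpu"):
    # Unpack transitions produced by a FirstLast-style experience source.
    states = torch.tensor(np.array([e.state for e in batch]),
                          dtype=torch.float32).to(device)
    actions = torch.tensor([e.action for e in batch]).to(device)
    rewards = np.array([e.reward for e in batch], dtype=np.float32)
    not_done = np.array([e.last_state is not None for e in batch])
    last_states = torch.tensor(np.array([
        e.last_state if e.last_state is not None else e.state for e in batch
    ]), dtype=torch.float32).to(device)

    # Bootstrap the value of the last state for non-terminal transitions
    with torch.no_grad():
        last_values = net(last_states)[1].squeeze(-1).cpu().numpy()
    returns = rewards + (params["gamma"] ** params["step_count"]) * last_values * not_done
    returns_t = torch.tensor(returns).to(device)

    logits, values = net(states)
    values = values.squeeze(-1)

    # Critic: regress V(s) towards the bootstrapped n-step return
    loss_v = F.mse_loss(values, returns_t)

    # Actor: policy gradient weighted by the advantage (detached from the critic)
    log_probs = F.log_softmax(logits, dim=1)
    advantage = returns_t - values.detach()
    loss_policy = -(advantage * log_probs[range(len(batch)), actions]).mean()

    return loss_policy, loss_v

A full A2C implementation usually also subtracts an entropy bonus to keep the policy from collapsing too early; it is omitted here because the shown call only consumes the policy and value losses.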