def main():
    env = KukaGymEnv(renders=True, isDiscrete=False, maxSteps=10000000)
    save_path = os.path.join("saves", "ddpg-")
    os.makedirs(save_path, exist_ok=True)
    device = torch.device("cuda")

    act_net = model.DDPGActor(
        env.observation_space.shape[0],
        env.action_space.shape[0]).to(device)
    crt_net = model.D4PGCritic(
        env.observation_space.shape[0],
        env.action_space.shape[0],
        N_ATOMS, Vmin, Vmax).to(device)
    print(act_net)
    print(crt_net)
    tgt_act_net = common.TargetNet(act_net)
    tgt_crt_net = common.TargetNet(crt_net)

    writer = SummaryWriter(comment="-d4pg_")
    agent = model.AgentDDPG(act_net, device=device)
    exp_source = experience.ExperienceSourceFirstLast(
        env, agent, gamma=GAMMA, steps_count=REWARD_STEPS)
    buffer = experience.ExperienceReplayBuffer(
        exp_source, buffer_size=REPLAY_SIZE)
    act_opt = optim.Adam(act_net.parameters(), lr=LEARNING_RATE)
    crt_opt = optim.Adam(crt_net.parameters(), lr=LEARNING_RATE)

    frame_idx = 0
    best_reward = None
    with common.RewardTracker(writer) as tracker:
        with common.TBMeanTracker(writer, batch_size=10) as tb_tracker:
            while True:
                frame_idx += 1
                buffer.populate(1)
                rewards_steps = exp_source.pop_rewards_steps()
                if rewards_steps:
                    rewards, steps = zip(*rewards_steps)
                    tb_tracker.track("episode_steps", steps[0], frame_idx)
                    tracker.reward(rewards[0], frame_idx)

                if len(buffer) < 100:
                    continue

                batch = buffer.sample(BATCH_SIZE)
                states_v, actions_v, rewards_v, dones_mask, last_states_v = \
                    common.unpack_batch_ddqn(batch, device)

                # train critic: cross-entropy between the predicted value
                # distribution and the projected Bellman target distribution
                crt_opt.zero_grad()
                crt_distr_v = crt_net(states_v, actions_v)
                last_act_v = tgt_act_net.target_model(last_states_v)
                last_distr_v = F.softmax(tgt_crt_net.target_model(
                    last_states_v, last_act_v), dim=1)
                proj_distr_v = distr_projection(
                    last_distr_v, rewards_v, dones_mask,
                    gamma=GAMMA**REWARD_STEPS, device=device)
                prob_dist_v = -F.log_softmax(crt_distr_v, dim=1) * proj_distr_v
                critic_loss_v = prob_dist_v.sum(dim=1).mean()
                critic_loss_v.backward()
                crt_opt.step()
                tb_tracker.track("loss_critic", critic_loss_v, frame_idx)

                # train actor: maximize the expected Q of the actor's actions
                act_opt.zero_grad()
                cur_actions_v = act_net(states_v)
                crt_distr_v = crt_net(states_v, cur_actions_v)
                actor_loss_v = -crt_net.distr_to_q(crt_distr_v)
                actor_loss_v = actor_loss_v.mean()
                actor_loss_v.backward()
                act_opt.step()
                tb_tracker.track("loss_actor", actor_loss_v, frame_idx)

                # soft-sync target networks toward the online networks
                tgt_act_net.alpha_sync(alpha=1 - 1e-3)
                tgt_crt_net.alpha_sync(alpha=1 - 1e-3)

                if frame_idx % TEST_ITERS == 0:
                    print("testing")
                    env.reset()
                    ts = time.time()
                    rewards, steps = test_net(act_net, env, device=device)
                    print("Test done in %.2f sec, reward %.3f, steps %d" % (
                        time.time() - ts, rewards, steps))
                    writer.add_scalar("test_reward", rewards, frame_idx)
                    writer.add_scalar("test_steps", steps, frame_idx)
                    if best_reward is None or best_reward < rewards:
                        if best_reward is not None:
                            print("Best reward updated: %.3f -> %.3f" % (
                                best_reward, rewards))
                            name = "best_%+.3f_%d.dat" % (rewards, frame_idx)
                            fname = os.path.join(save_path, name)
                            torch.save(act_net.state_dict(), fname)
                        best_reward = rewards
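The listing above relies on common.TargetNet and its alpha_sync soft update. As a point of reference, a minimal sketch of such an alpha-blending target copy (assuming a ptan-style wrapper; the actual common.TargetNet implementation may differ) could look like this:

import copy
import torch.nn as nn

class SoftTargetNet:
    """Keeps a frozen copy of a model and blends new weights into it."""
    def __init__(self, model: nn.Module):
        self.model = model
        self.target_model = copy.deepcopy(model)

    def alpha_sync(self, alpha: float):
        # new_target = alpha * old_target + (1 - alpha) * online_weights
        assert 0.0 < alpha < 1.0
        state = self.model.state_dict()
        tgt_state = self.target_model.state_dict()
        for key, value in state.items():
            tgt_state[key] = tgt_state[key] * alpha + (1 - alpha) * value
        self.target_model.load_state_dict(tgt_state)

With alpha = 1 - 1e-3, only 0.1% of the online weights leak into the target copy per step, so the bootstrap targets move slowly and training stays stable.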
frame_idx += 1
buffer.populate(1)
rewards_steps = exp_source.pop_rewards_steps()
if rewards_steps:
    rewards, steps = zip(*rewards_steps)
    tb_tracker.track("episode_steps", steps[0], frame_idx)
    tracker.reward(rewards[0], frame_idx)

if len(buffer) < REPLAY_INITIAL:
    continue

# draw a sample batch from the replay buffer
batch = buffer.sample(BATCH_SIZE)
states_v, actions_v, rewards_v, \
    dones_mask, last_states_v = \
    common.unpack_batch_ddqn(batch, device)

# optimize the critic network
crt_opt.zero_grad()
crt_distr_v = crt_net(states_v, actions_v)
last_act_v = tgt_act_net.target_model(
    last_states_v)
last_distr_v = F.softmax(
    tgt_crt_net.target_model(
        last_states_v, last_act_v), dim=1)
proj_distr_v = distr_projection(
    last_distr_v, rewards_v, dones_mask,
    gamma=GAMMA**REWARD_STEPS, device=device)
prob_dist_v = -F.log_softmax(
    crt_distr_v, dim=1) * proj_distr_v
# compute the critic network's loss
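The distributional listings above call distr_projection to map the target critic's value distribution back onto the fixed support before the cross-entropy loss is computed. A sketch of such a categorical projection in the spirit of C51/D4PG, assuming the module-level constants N_ATOMS, Vmin and Vmax used in the listings (the project's own distr_projection may differ in detail):

import numpy as np
import torch

def distr_projection(next_distr_v, rewards_v, dones_mask_t, gamma, device="cpu"):
    next_distr = next_distr_v.data.cpu().numpy()
    rewards = rewards_v.data.cpu().numpy()
    dones_mask = dones_mask_t.cpu().numpy().astype(bool)
    batch_size = len(rewards)
    proj_distr = np.zeros((batch_size, N_ATOMS), dtype=np.float32)
    delta_z = (Vmax - Vmin) / (N_ATOMS - 1)
    for atom in range(N_ATOMS):
        # shrink and shift the atom's support value, then clip it to [Vmin, Vmax]
        tz_j = np.minimum(Vmax, np.maximum(
            Vmin, rewards + (Vmin + atom * delta_z) * gamma))
        b_j = (tz_j - Vmin) / delta_z
        l = np.floor(b_j).astype(np.int64)
        u = np.ceil(b_j).astype(np.int64)
        # probability mass lands exactly on one atom ...
        eq_mask = u == l
        proj_distr[eq_mask, l[eq_mask]] += next_distr[eq_mask, atom]
        # ... or is split between the two neighbouring atoms
        ne_mask = u != l
        proj_distr[ne_mask, l[ne_mask]] += next_distr[ne_mask, atom] * (u - b_j)[ne_mask]
        proj_distr[ne_mask, u[ne_mask]] += next_distr[ne_mask, atom] * (b_j - l)[ne_mask]
    if dones_mask.any():
        # for terminal transitions the target collapses onto the (clipped) reward
        proj_distr[dones_mask] = 0.0
        tz_j = np.minimum(Vmax, np.maximum(Vmin, rewards[dones_mask]))
        b_j = (tz_j - Vmin) / delta_z
        l = np.floor(b_j).astype(np.int64)
        u = np.ceil(b_j).astype(np.int64)
        eq_mask = u == l
        eq_dones = dones_mask.copy()
        eq_dones[dones_mask] = eq_mask
        if eq_dones.any():
            proj_distr[eq_dones, l[eq_mask]] = 1.0
        ne_mask = u != l
        ne_dones = dones_mask.copy()
        ne_dones[dones_mask] = ne_mask
        if ne_dones.any():
            proj_distr[ne_dones, l[ne_mask]] = (u - b_j)[ne_mask]
            proj_distr[ne_dones, u[ne_mask]] = (b_j - l)[ne_mask]
    return torch.FloatTensor(proj_distr).to(device)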
with ptan.common.utils.TBMeanTracker(writer, batch_size=10) as tb_tracker:
    while frame_idx < MAX_IDX:
        frame_idx += 1
        buffer.populate(1)
        rewards_steps = exp_source.pop_rewards_steps()
        if rewards_steps:
            rewards, steps = zip(*rewards_steps)
            tb_tracker.track("episode_steps", steps[0], frame_idx)
            tracker.reward(rewards[0], frame_idx)

        if len(buffer) < REPLAY_INITIAL:
            continue

        batch = buffer.sample(BATCH_SIZE)
        states_v, actions_v, rewards_v, dones_mask, last_states_v = \
            common.unpack_batch_ddqn(batch, device)

        # train critic
        crt_opt.zero_grad()
        q_v = crt_net(states_v, actions_v)
        last_act_v = tgt_act_net.target_model(last_states_v)
        q_last_v = tgt_crt_net.target_model(last_states_v, last_act_v)
        q_last_v[dones_mask] = 0.0
        q_ref_v = rewards_v.unsqueeze(dim=-1) + q_last_v * (GAMMA ** args.n)
        critic_loss_v = F.mse_loss(q_v, q_ref_v.detach())
        critic_loss_v.backward()
        crt_opt.step()
        tb_tracker.track("loss_critic", critic_loss_v, frame_idx)
        tb_tracker.track("critic_ref", q_ref_v.mean(), frame_idx)
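Every variant of the loop unpacks the sampled batch with common.unpack_batch_ddqn. A minimal sketch of what such a helper typically does, assuming ptan's ExperienceFirstLast entries with state, action, reward and last_state fields (the project's actual helper may differ):

import numpy as np
import torch

def unpack_batch_ddqn(batch, device="cpu"):
    """Convert a list of ExperienceFirstLast entries into training tensors."""
    states, actions, rewards, dones, last_states = [], [], [], [], []
    for exp in batch:
        states.append(exp.state)
        actions.append(exp.action)
        rewards.append(exp.reward)
        dones.append(exp.last_state is None)
        # for terminal transitions keep the current state as a placeholder;
        # its bootstrap value is masked out via dones_mask in the training loop
        last_states.append(exp.state if exp.last_state is None else exp.last_state)
    states_v = torch.FloatTensor(np.asarray(states)).to(device)
    actions_v = torch.FloatTensor(np.asarray(actions)).to(device)
    rewards_v = torch.FloatTensor(np.asarray(rewards)).to(device)
    last_states_v = torch.FloatTensor(np.asarray(last_states)).to(device)
    dones_mask = torch.BoolTensor(dones).to(device)
    return states_v, actions_v, rewards_v, dones_mask, last_states_v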
with ptan.common.utils.TBMeanTracker(writer, batch_size=10) as tb_tracker:
    while True:
        frame_idx += 1
        buffer.populate(1)
        rewards_steps = exp_source.pop_rewards_steps()
        if rewards_steps:
            rewards, steps = zip(*rewards_steps)
            tb_tracker.track("episode_steps", steps[0], frame_idx)
            tracker.reward(rewards[0], frame_idx)

        if len(buffer) < REPLAY_INITIAL:
            continue

        batch = buffer.sample(BATCH_SIZE)
        states_v, actions_v, rewards_v, dones_mask, last_states_v = \
            common.unpack_batch_ddqn(batch, cuda=args.cuda)

        # train critic
        crt_opt.zero_grad()
        crt_distr_v = crt_net(states_v, actions_v)
        last_act_v = tgt_act_net.target_model(last_states_v)
        last_distr_v = F.softmax(tgt_crt_net.target_model(
            last_states_v, last_act_v), dim=1)
        proj_distr_v = distr_projection(
            last_distr_v, rewards_v, dones_mask,
            gamma=GAMMA**REWARD_STEPS, cuda=args.cuda)
        prob_dist_v = -F.log_softmax(crt_distr_v, dim=1) * proj_distr_v
        critic_loss_v = prob_dist_v.sum(dim=1).mean()
with ptan.common.utils.RewardTracker(writer) as tracker:
    with ptan.common.utils.TBMeanTracker(writer, batch_size=10) as tb_tracker:
        while True:
            frame_idx += 1
            buffer.populate(1)
            rewards_steps = exp_source.pop_rewards_steps()
            if rewards_steps:
                rewards, steps = zip(*rewards_steps)
                tb_tracker.track("episode_steps", steps[0], frame_idx)
                tracker.reward(rewards[0], frame_idx)

            if len(buffer) < REPLAY_INITIAL:
                continue

            batch = buffer.sample(BATCH_SIZE)
            states_v, actions_v, rewards_v, dones_mask, last_states_v = \
                common.unpack_batch_ddqn(batch, device)

            # train critic
            crt_opt.zero_grad()
            q_v = crt_net(states_v, actions_v)
            last_act_v = tgt_act_net.target_model(last_states_v)
            q_last_v = tgt_crt_net.target_model(last_states_v, last_act_v)
            q_last_v[dones_mask] = 0.0
            q_ref_v = rewards_v.unsqueeze(dim=-1) + q_last_v * GAMMA
            critic_loss_v = F.mse_loss(q_v, q_ref_v.detach())
            critic_loss_v.backward()
            crt_opt.step()
            tb_tracker.track("loss_critic", critic_loss_v, frame_idx)
            tb_tracker.track("critic_ref", q_ref_v.mean(), frame_idx)

            # train actor
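            # The listing stops at the "train actor" comment. What typically follows in a
            # DDPG loop of this shape is sketched below (an assumption, not necessarily the
            # exact continuation of this file): the actor is trained to produce actions that
            # maximize the critic's Q-value, and the target networks are then soft-synced.
            act_opt.zero_grad()
            cur_actions_v = act_net(states_v)
            # negative Q as the loss, i.e. gradient ascent on the critic's value estimate
            actor_loss_v = -crt_net(states_v, cur_actions_v)
            actor_loss_v = actor_loss_v.mean()
            actor_loss_v.backward()
            act_opt.step()
            tb_tracker.track("loss_actor", actor_loss_v, frame_idx)

            # slowly blend the online weights into the target networks
            tgt_act_net.alpha_sync(alpha=1 - 1e-3)
            tgt_crt_net.alpha_sync(alpha=1 - 1e-3)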