train_queue = mp.Queue(maxsize=PROCESSES_COUNT)
data_proc_list = []
for _ in range(PROCESSES_COUNT):
    data_proc = mp.Process(target=data_func,
                           args=(net, device, train_queue))
    data_proc.start()
    data_proc_list.append(data_proc)

batch_states = []
batch_actions = []
batch_vals_ref = []
step_idx = 0
batch_size = 0

try:
    with common.RewardTracker(writer, REWARD_BOUND) as tracker:
        with ptan.common.utils.TBMeanTracker(writer, 100) as tb_tracker:
            while True:
                train_entry = train_queue.get()
                # child processes report finished episodes as TotalReward entries
                if isinstance(train_entry, TotalReward):
                    if tracker.reward(train_entry.reward, step_idx):
                        break
                    continue

                # otherwise the entry is a tuple of already-unpacked tensors
                states_t, actions_t, vals_ref_t = train_entry
                batch_states.append(states_t)
                batch_actions.append(actions_t)
                batch_vals_ref.append(vals_ref_t)
                step_idx += states_t.size()[0]
                batch_size += states_t.size()[0]
                if batch_size < BATCH_SIZE:
                    continue
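The loop above only consumes what the child processes put on train_queue, so it relies on a producer that emits both TotalReward records and tuples of training tensors. Below is a minimal sketch of such a producer, assuming the TotalReward namedtuple, a make_env helper, a MICRO_BATCH_SIZE constant, and a common.unpack_batch utility; treat it as an illustration of the protocol, not the repository's exact data_func.

import collections

TotalReward = collections.namedtuple("TotalReward", field_names="reward")

def data_func(net, device, train_queue):
    envs = [make_env() for _ in range(NUM_ENVS)]
    agent = ptan.agent.PolicyAgent(lambda x: net(x)[0],
                                   device=device, apply_softmax=True)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        envs, agent, gamma=GAMMA, steps_count=REWARD_STEPS)
    micro_batch = []

    for exp in exp_source:
        # report the reward of every finished episode to the parent process
        new_rewards = exp_source.pop_total_rewards()
        if new_rewards:
            train_queue.put(TotalReward(reward=np.mean(new_rewards)))

        micro_batch.append(exp)
        if len(micro_batch) < MICRO_BATCH_SIZE:
            continue

        # convert experience into tensors and hand them to the training process
        data = common.unpack_batch(
            micro_batch, net, device=device,
            last_val_gamma=GAMMA ** REWARD_STEPS)
        train_queue.put(data)
        micro_batch.clear()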
net = model_vnc.ModelMultimodal(input_shape=wob_vnc.WOB_SHAPE,
                                n_actions=env.action_space.n).to(device)
print(net)
optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE, eps=1e-3)

preprocessor = model_vnc.MultimodalPreprocessor(device=device)
agent = ptan.agent.PolicyAgent(lambda x: net(x)[0], device=device,
                               apply_softmax=True,
                               preprocessor=preprocessor)
exp_source = ptan.experience.ExperienceSourceFirstLast(
    [env], agent, gamma=GAMMA,
    steps_count=REWARD_STEPS, vectorized=True)

best_reward = None
with common.RewardTracker(writer) as tracker:
    with ptan.common.utils.TBMeanTracker(writer, batch_size=10) as tb_tracker:
        batch = []
        for step_idx, exp in enumerate(exp_source):
            rewards_steps = exp_source.pop_rewards_steps()
            if rewards_steps:
                rewards, steps = zip(*rewards_steps)
                tb_tracker.track("episode_steps", np.mean(steps), step_idx)
                mean_reward = tracker.reward(np.mean(rewards), step_idx)
                if mean_reward is not None:
                    # checkpoint the model whenever the smoothed reward improves
                    if best_reward is None or mean_reward > best_reward:
                        if best_reward is not None:
                            name = "best_%.3f_%d" % (mean_reward, step_idx)
                            fname = os.path.join(saves_path, name)
                            torch.save(net.state_dict(), fname + ".dat")
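The checkpoint written above contains only the network's state_dict, so a later evaluation script can restore the weights along the following lines. This is a sketch: the file name is hypothetical, and if the MultimodalPreprocessor builds a learned token dictionary during training, that state would need to be persisted separately as well.

# Sketch: restoring a saved checkpoint for evaluation (file name is hypothetical)
net = model_vnc.ModelMultimodal(input_shape=wob_vnc.WOB_SHAPE,
                                n_actions=env.action_space.n).to(device)
net.load_state_dict(torch.load("saves/best_0.840_300000.dat",
                               map_location=device))
net.eval()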
envs = [make_env() for _ in range(NUM_ENVS)]
writer = SummaryWriter(comment="-pong-a2c_" + args.name)

net = AtariA2C(envs[0].observation_space.shape,
               envs[0].action_space.n).to(device)
print(net)

agent = ptan.agent.PolicyAgent(lambda x: net(x)[0],
                               apply_softmax=True, device=device)
exp_source = ptan.experience.ExperienceSourceFirstLast(
    envs, agent, gamma=GAMMA, steps_count=REWARD_STEPS)
optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE, eps=1e-3)

batch = []
with common.RewardTracker(writer, stop_reward=18) as tracker:
    with ptan.common.utils.TBMeanTracker(writer, batch_size=10) as tb_tracker:
        for step_idx, exp in enumerate(exp_source):
            batch.append(exp)

            # handle new rewards
            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                if tracker.reward(new_rewards[0], step_idx):
                    break

            if len(batch) < BATCH_SIZE:
                continue

            states_v, actions_t, vals_ref_v = unpack_batch(batch, net, device=device)
            batch.clear()
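The unpack_batch call is where the n-step value targets come from. The sketch below shows what such a helper is expected to do with ExperienceSourceFirstLast entries: stack states and actions into tensors and bootstrap V(s_last) from the critic head for transitions that did not end an episode. It mirrors the call signature used above and assumes numpy (np) and torch are imported; it is an illustrative version, not necessarily the exact helper from the repository.

def unpack_batch(batch, net, device="cpu"):
    states, actions, rewards, not_done_idx, last_states = [], [], [], [], []
    for idx, exp in enumerate(batch):
        states.append(np.asarray(exp.state))
        actions.append(int(exp.action))
        rewards.append(exp.reward)           # already discounted over REWARD_STEPS
        if exp.last_state is not None:       # None marks the end of an episode
            not_done_idx.append(idx)
            last_states.append(np.asarray(exp.last_state))

    states_v = torch.FloatTensor(np.asarray(states)).to(device)
    actions_t = torch.LongTensor(actions).to(device)

    rewards_np = np.array(rewards, dtype=np.float32)
    if not_done_idx:
        # bootstrap the value of the last observed state for non-terminal chains
        last_states_v = torch.FloatTensor(np.asarray(last_states)).to(device)
        last_vals_np = net(last_states_v)[1].data.cpu().numpy()[:, 0]
        rewards_np[not_done_idx] += (GAMMA ** REWARD_STEPS) * last_vals_np

    vals_ref_v = torch.FloatTensor(rewards_np).to(device)
    return states_v, actions_t, vals_ref_v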
def grads_func(proc_name, net, device, train_queue):
    envs = [make_env() for _ in range(NUM_ENVS)]

    agent = ptan.agent.PolicyAgent(lambda x: net(x)[0],
                                   device=device, apply_softmax=True)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        envs, agent, gamma=GAMMA, steps_count=REWARD_STEPS)

    batch = []
    frame_idx = 0
    writer = SummaryWriter(comment=proc_name)

    with common.RewardTracker(writer, REWARD_BOUND) as tracker:
        with ptan.common.utils.TBMeanTracker(writer, 100) as tb_tracker:
            for exp in exp_source:
                frame_idx += 1
                new_rewards = exp_source.pop_total_rewards()
                if new_rewards and tracker.reward(new_rewards[0], frame_idx):
                    break

                batch.append(exp)
                if len(batch) < GRAD_BATCH:
                    continue

                data = common.unpack_batch(
                    batch, net, device=device,
                    last_val_gamma=GAMMA ** REWARD_STEPS)
                states_v, actions_t, vals_ref_v = data
                batch.clear()

                net.zero_grad()
                logits_v, value_v = net(states_v)

                # value loss: critic regression to the n-step targets
                loss_value_v = F.mse_loss(value_v.squeeze(-1), vals_ref_v)

                # policy loss: log-probability of taken actions scaled by advantage
                log_prob_v = F.log_softmax(logits_v, dim=1)
                adv_v = vals_ref_v - value_v.detach()
                log_p_a = log_prob_v[range(GRAD_BATCH), actions_t]
                log_prob_actions_v = adv_v * log_p_a
                loss_policy_v = -log_prob_actions_v.mean()

                # entropy bonus to keep the policy from collapsing too early
                prob_v = F.softmax(logits_v, dim=1)
                ent = (prob_v * log_prob_v).sum(dim=1).mean()
                entropy_loss_v = ENTROPY_BETA * ent

                loss_v = entropy_loss_v + loss_value_v + loss_policy_v
                loss_v.backward()

                tb_tracker.track("advantage", adv_v, frame_idx)
                tb_tracker.track("values", value_v, frame_idx)
                tb_tracker.track("batch_rewards", vals_ref_v, frame_idx)
                tb_tracker.track("loss_entropy", entropy_loss_v, frame_idx)
                tb_tracker.track("loss_policy", loss_policy_v, frame_idx)
                tb_tracker.track("loss_value", loss_value_v, frame_idx)
                tb_tracker.track("loss_total", loss_v, frame_idx)

                # gather gradients
                nn_utils.clip_grad_norm_(net.parameters(), CLIP_GRAD)
                grads = [
                    param.grad.data.cpu().numpy() if param.grad is not None else None
                    for param in net.parameters()
                ]
                train_queue.put(grads)

    train_queue.put(None)
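grads_func only computes and ships gradients; applying them happens in the parent process that owns the optimizer for the shared network. A simplified sketch of that consumer is below: it copies each worker's gradient list into the corresponding parameters and takes an optimizer step, treating the None sentinel as the stop signal. The actual training loop may instead accumulate gradients from several workers before stepping, so read this as an illustration of the mechanism rather than the repository's exact code.

# Sketch: parent process applying gradients produced by grads_func workers
while True:
    train_entry = train_queue.get()
    if train_entry is None:
        # a worker reached the reward bound, stop training
        break

    # copy the worker's gradients into the shared network's parameters
    for param, grad in zip(net.parameters(), train_entry):
        if grad is not None:
            param.grad = torch.FloatTensor(grad).to(device)

    optimizer.step()
    optimizer.zero_grad()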