import torch

# `Trajectories` and `scale_ob` are repo-local helpers assumed to be in scope
# (they are referenced but not defined in this file).


def _generate(device, env, policy, ob_scale,
              number_timesteps, gamma, timesteps_per_batch):
    """ Generate trajectories """
    record = ['o', 'a', 'r', 'done', 'vpred']
    export = ['o', 'a', 'r', 'vpred']
    trajectories = Trajectories(record, export, device, gamma, ob_scale)
    o = env.reset()
    infos = []
    for n in range(1, number_timesteps + 1):
        # sample action from the current policy and record the value estimate
        with torch.no_grad():
            logits, v = policy(scale_ob(o, device, ob_scale))
            dist = torch.distributions.Categorical(logits=logits)
            a = dist.sample().cpu().numpy()
            v = v.cpu().numpy()[:, 0]

        # take action in env and collect episode statistics
        o_, r, done, info = env.step(a)
        for d in info:
            if d.get('episode'):
                infos.append(d['episode'])

        # store batch data and update observation
        trajectories.append(o, a, r, done, v)
        if n % timesteps_per_batch == 0:
            with torch.no_grad():
                ob = scale_ob(o_, device, ob_scale)
                # bootstrap value of the next observation, zeroed for
                # environments that just terminated
                v_ = policy(ob)[1].cpu().numpy()[:, 0] * (1 - done)
            yield trajectories.export(v_) + (infos, )
            infos.clear()
        o = o_
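# A minimal sketch of how the trajectory generator above might be consumed by
# a training loop. It assumes `Trajectories.export(v_)` returns a tuple of
# batched arrays matching the `export` keys plus computed returns; the
# `compute_loss` and `optimizer` names below are hypothetical, for
# illustration only.
#
#     gen = _generate(device, env, policy, ob_scale,
#                     number_timesteps, gamma, timesteps_per_batch)
#     for *batch, infos in gen:
#         loss = compute_loss(policy, *batch)  # hypothetical loss function
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()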
import math
import random
from copy import deepcopy

import numpy as np
import torch
import torch.nn as nn
from torch.nn.functional import log_softmax, softmax

# `scale_ob` is a repo-local helper assumed to be in scope.


def _generate(device, env, qnet, ob_scale,
              number_timesteps, param_noise,
              exploration_fraction, exploration_final_eps):
    """ Generate training batch sample """
    noise_scale = 1e-2
    action_dim = env.action_space.n
    explore_steps = number_timesteps * exploration_fraction
    o = env.reset()
    infos = dict()
    epret = eplen = 0
    for n in range(1, number_timesteps + 1):
        # linearly anneal epsilon from 1.0 down to exploration_final_eps
        epsilon = 1.0 - (1.0 - exploration_final_eps) * n / explore_steps
        epsilon = max(exploration_final_eps, epsilon)

        # sample action
        with torch.no_grad():
            ob = scale_ob(np.expand_dims(o, 0), device, ob_scale)
            q = qnet(ob)
            if not param_noise:
                # plain epsilon-greedy exploration
                if random.random() < epsilon:
                    a = int(random.random() * action_dim)
                else:
                    a = q.argmax(1).cpu().numpy()[0]
            else:
                # adaptive parameter noise,
                # see Appendix C of `https://arxiv.org/abs/1706.01905`
                q_dict = deepcopy(qnet.state_dict())
                for _, m in qnet.named_modules():
                    if isinstance(m, nn.Linear):
                        std = torch.empty_like(m.weight).fill_(noise_scale)
                        m.weight.data.add_(torch.normal(0, std).to(device))
                        std = torch.empty_like(m.bias).fill_(noise_scale)
                        m.bias.data.add_(torch.normal(0, std).to(device))
                q_perturb = qnet(ob)
                # KL(softmax(q) || softmax(q_perturb)) measures how much the
                # perturbation changed the greedy policy
                kl_perturb = ((log_softmax(q, 1) - log_softmax(q_perturb, 1)) *
                              softmax(q, 1)).sum(-1).mean()
                # target KL: the divergence an epsilon-greedy policy induces
                kl_explore = -math.log(1 - epsilon + epsilon / action_dim)
                if kl_perturb < kl_explore:
                    noise_scale *= 1.01
                else:
                    noise_scale /= 1.01
                qnet.load_state_dict(q_dict)  # restore unperturbed weights
                if random.random() < epsilon:
                    a = int(random.random() * action_dim)
                else:
                    a = q_perturb.argmax(1).cpu().numpy()[0]

        # take action in env and track episode return/length
        o_, r, done, _ = env.step(a)
        epret += r
        eplen += 1
        if done:
            infos = {'eprewmean': epret, 'eplenmean': eplen}
            epret = eplen = 0

        # return data and update observation
        yield (o, [a], [r], o_, [int(done)], infos)
        infos = dict()
        o = o_ if not done else env.reset()
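# A minimal sketch of how the batch generator above might feed a replay
# buffer. The `buffer.add` call and `logger` below are hypothetical and not
# part of this module; each yielded tuple is one transition
# (o, [a], [r], o_, [done]) plus an info dict with episode statistics.
#
#     gen = _generate(device, env, qnet, ob_scale, number_timesteps,
#                     param_noise, exploration_fraction, exploration_final_eps)
#     for o, a, r, o_, done, info in gen:
#         buffer.add(o, a, r, o_, done)  # hypothetical replay buffer
#         if info:
#             logger.record(info)  # hypothetical episode-stat logging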