def log(model, i):
    mmm = []
    for loader in a_loader, b_loader, c_loader:
        y, y_bar = infer(loader, model)
        tp = utils.tp(y, y_bar) / len(y)
        fp = utils.fp(y, y_bar) / len(y)
        fn = utils.fn(y, y_bar) / len(y)
        tn = utils.tn(y, y_bar) / len(y)
        a = tp + tn                 # accuracy (the four rates sum to 1)
        p1 = tp + fn                # positive-class rate, the recall denominator
        p = utils.div(tp, tp + fp)  # precision
        r = utils.div(tp, p1)       # recall
        m = metric(p1, fn, fp)
        mmm.append([tp, fp, fn, tn, a, p, r, m])
    tagg = ['tp', 'fp', 'fn', 'tn', 'a', 'p', 'r', args.metric]
    placeholder = '0' * (len(str(args.ni)) - len(str(i)))
    xx = ['/'.join(['%0.2f' % m for m in mm]) for mm in zip(*mmm)]
    x = ' | '.join('%s %s' % (tag, mm) for tag, mm in zip(tagg, xx))
    print('[iteration %s%d] %s' % (placeholder, i, x))
    if args.tb:
        for writer, mm in zip([a_writer, b_writer, c_writer], mmm):
            for tag, m in zip(tagg, mm):
                writer.add_scalar(tag, m, i)
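# utils.div is used above for precision and recall but is not defined in this section.
# It is assumed to be a zero-safe division that returns 0 when the denominator is 0;
# a minimal sketch under that assumption (the project's actual helper may differ):
def _div_sketch(numerator, denominator):
    # Avoid ZeroDivisionError when there are no predicted or actual positives
    return numerator / denominator if denominator else 0.0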
def test_agent(env, agent, run=0, episodes=5, time_steps=500, initial_state=None,
               initial_noise=None, render=True, deterministic=True):
    stats = EpisodeStats(episode_lengths=np.zeros(episodes),
                         episode_rewards=np.zeros(episodes),
                         episode_loss=np.zeros(episodes))
    print_header(3, 'Testing')
    for e in range(episodes):
        s = env.reset(initial_state=initial_state, noise_amplitude=initial_noise)
        for t in range(time_steps):
            if render:
                env.render()
            a = agent.get_action(s, deterministic=deterministic)
            s, r, d, _ = env.step(tn(a))
            stats.episode_rewards[e] += r
            stats.episode_lengths[e] = t
            if d:
                break
        pr_stats = {'run': run,
                    'steps': int(stats.episode_lengths[e] + 1),
                    'episode': e + 1,
                    'episodes': episodes,
                    'reward': stats.episode_rewards[e]}
        print_stats(pr_stats)
    if render:
        env.viewer.close()
    return stats
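# tn() is called throughout this file to hand torch actions to the environment, but it is
# not defined in this section. It is assumed to be the usual tensor-to-numpy conversion;
# a minimal sketch under that assumption:
import torch

def _tn_sketch(tensor):
    # Detach from the autograd graph, move to CPU and convert to a numpy array
    return tensor.detach().cpu().numpy()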
def train(self, env, episodes, time_steps, initial_state=None, initial_noise=0.5):
    stats = EpisodeStats(episode_lengths=np.zeros(episodes),
                         episode_rewards=np.zeros(episodes),
                         episode_loss=np.zeros(episodes))
    self._run += 1
    for e in range(episodes):
        s = env.reset(initial_state=initial_state, noise_amplitude=initial_noise)
        for t in range(time_steps):
            a = self._actor.get_action(s, deterministic=False)
            ns, r, d, _ = env.step(tn(a))
            stats.episode_rewards[e] += r
            stats.episode_lengths[e] = t
            self._steps += 1
            self._replay_buffer.add_transition(s, a, ns, r, d)

            # Sample replay buffer
            b_states, b_actions, b_nstates, b_rewards, b_terminal = \
                self._replay_buffer.random_next_batch(self._batch_size)

            # Get action according to target actor policy
            b_nactions = self._actor_target.get_action(b_nstates, deterministic=False)

            # Compute the target Q value from target critic
            target_Q1, target_Q2 = self._critic_target(b_nstates, b_nactions)
            target_Q = torch.min(target_Q1, target_Q2).reshape((-1))
            target_Q = b_rewards + (1 - b_terminal) * self._gamma * target_Q
            target_Q = target_Q.reshape((-1, 1)).detach()

            # Get current Q estimates from critic
            current_Q1, current_Q2 = self._critic(b_states, b_actions)

            # Compute critic loss
            critic_loss = self._critic_loss(current_Q1, target_Q) + \
                self._critic_loss(current_Q2, target_Q)
            stats.episode_loss[e] += critic_loss.item()

            # Optimize the critic
            self._critic_optimizer.zero_grad()
            critic_loss.backward()
            self._critic_optimizer.step()

            # Delayed policy updates
            if self._steps % self._policy_freq == 0:
                # Compute the actor loss via the deterministic policy gradient
                actor_loss = -self._critic.Q1(
                    b_states, self._actor.get_action(b_states, deterministic=True)).mean()

                # Optimize the actor
                self._actor_optimizer.zero_grad()
                actor_loss.backward()
                self._actor_optimizer.step()

                # Soft-update the target models
                soft_update(self._critic_target, self._critic, self._tau)
                soft_update(self._actor_target, self._actor, self._tau)

            if d:
                break
            s = ns

        pr_stats = {'run': self._run,
                    'steps': int(stats.episode_lengths[e] + 1),
                    'episode': e + 1,
                    'episodes': episodes,
                    'reward': stats.episode_rewards[e],
                    'loss': stats.episode_loss[e]}
        print_stats(pr_stats)
    return stats
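# soft_update() is called above but not defined in this section. In TD3 it is usually
# Polyak averaging of the target-network parameters; a minimal sketch under that
# assumption (the actual helper in this repository may differ):
import torch

def _soft_update_sketch(target, source, tau):
    # target <- tau * source + (1 - tau) * target, applied parameter-wise
    with torch.no_grad():
        for t_p, s_p in zip(target.parameters(), source.parameters()):
            t_p.data.mul_(1.0 - tau).add_(tau * s_p.data)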
def train(self, env, episodes, time_steps, initial_state=None, initial_noise=0.5):
    stats = EpisodeStats(episode_lengths=np.zeros(episodes),
                         episode_rewards=np.zeros(episodes),
                         episode_loss=np.zeros(episodes))
    self._run += 1
    for e in range(episodes):
        # Generate an episode.
        # An episode is an array of (state, action, reward) tuples
        episode = []
        s = env.reset(initial_state=initial_state, noise_amplitude=initial_noise)
        total_r = 0
        for t in range(time_steps):
            a = self._get_action(s)
            ns, r, d, _ = env.step(tn(self._action_fun.act2env(a)))
            stats.episode_rewards[e] += r
            stats.episode_lengths[e] = t
            episode.append((s, a, r))
            total_r += r
            if d:
                break
            s = ns

        gamma_t = 1
        for t in range(len(episode)):
            # Compute the discounted return from time step t onward
            s, a, r = episode[t]
            g = 0
            gamma_kt = 1
            for k in range(t, len(episode)):
                gamma_kt = gamma_kt * self._gamma
                _, _, r_k = episode[k]
                g = g + (gamma_kt * r_k)
            g = float(g)

            p = self._pi(s, a)
            # For numerical stability: clamp so the probability does not exceed one
            # (e.g. for a delta distribution) and is never exactly 0, which would
            # break the log in the score function.
            eps = 1e-8
            p = p.clamp(eps, 1)
            log_p = torch.log(p)

            gamma_t = gamma_t * self._gamma
            if self._baseline:
                bl = self.baseline_fun(s)
                delta = g - bl

                bl_loss = self._bl_loss_function(self.baseline_fun(s), tt([g]))
                self._bl_optimizer.zero_grad()
                bl_loss.backward()
                self._bl_optimizer.step()

                score_fun = torch.mean(-(gamma_t * delta) * log_p)
            else:
                score_fun = torch.mean(-(gamma_t * g) * log_p)

            stats.episode_loss[e] += score_fun.item()
            self._pi_optimizer.zero_grad()
            score_fun.backward()
            self._pi_optimizer.step()

        pr_stats = {'run': self._run,
                    'steps': int(stats.episode_lengths[e] + 1),
                    'episode': e + 1,
                    'episodes': episodes,
                    'reward': stats.episode_rewards[e],
                    'loss': stats.episode_loss[e]}
        print_stats(pr_stats)
    return stats
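# Note: the nested loop above recomputes the discounted return from scratch for every
# time step (O(T^2) per episode), and it also discounts r_t itself by gamma, i.e. it
# computes gamma * G_t rather than the textbook G_t. A single backward pass over the
# episode computes all textbook returns in O(T); shown here only as a sketch for
# comparison, not as part of the original trainer:
def _discounted_returns_sketch(rewards, gamma):
    # Walk the episode backwards, accumulating G_t = r_t + gamma * G_{t+1}
    returns = [0.0] * len(rewards)
    g = 0.0
    for t in reversed(range(len(rewards))):
        g = rewards[t] + gamma * g
        returns[t] = g
    return returns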
def clip_action(a):
    # Convert to numpy and clip to the open interval (-1, 1),
    # staying strictly inside the action bounds
    return np.clip(tn(a), -1 + 1e-8, 1 - 1e-8)


def identity(a):
    # Convert to numpy and pass the action through unchanged
    return tn(a)
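# clip_action and identity look like candidates for the act2env transform that the
# REINFORCE trainer calls via self._action_fun.act2env(a). A hypothetical wrapper,
# shown only to illustrate the assumed interface (names are not from this repository):
class _ActionFunSketch:
    def __init__(self, act2env=identity):
        # act2env maps the agent's torch action to the numpy action the env expects
        self.act2env = act2env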