Example #1
def log(model, i):
    mmm = []
    for loader in a_loader, b_loader, c_loader:
        y, y_bar = infer(loader, model)

        tp = utils.tp(y, y_bar) / len(y)
        fp = utils.fp(y, y_bar) / len(y)
        fn = utils.fn(y, y_bar) / len(y)
        tn = utils.tn(y, y_bar) / len(y)

        a = tp + tn                 # accuracy: the four rates sum to 1
        p = utils.div(tp, tp + fp)  # precision
        r = utils.div(tp, tp + fn)  # recall (the snippet's undefined p1 is taken to be tp + fn)
        m = metric(tp, fn, fp)      # metric named by args.metric (assumed argument order)
        mmm.append([tp, fp, fn, tn, a, p, r, m])

    tagg = ['tp', 'fp', 'fn', 'tn', 'a', 'p', 'r', args.metric]

    # Zero-pad the iteration index to the width of args.ni
    placeholder = '0' * (len(str(args.ni)) - len(str(i)))
    # One 'a/b/c' column per metric, then join them into a single status line
    xx = ['/'.join(['%0.2f' % m for m in mm]) for mm in zip(*mmm)]
    x = ' | '.join('%s %s' % (tag, mm) for tag, mm in zip(tagg, xx))
    print('[iteration %s%d] %s' % (placeholder, i, x))

    if args.tb:
        for writer, mm in zip([a_writer, b_writer, c_writer], mmm):
            for tag, m in zip(tagg, mm):
                writer.add_scalar(tag, m, i)
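
Example #1 relies on helper functions from a utils module that are not shown. A minimal sketch of the zero-safe division helper it appears to assume (the actual project code may differ):

def div(numerator, denominator):
    # Zero-safe division so precision and recall stay defined when the
    # denominator is zero (e.g. no positive predictions at all).
    return numerator / denominator if denominator else 0.0
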
Example #2
def test_agent(env,
               agent,
               run=0,
               episodes=5,
               time_steps=500,
               initial_state=None,
               initial_noise=None,
               render=True,
               deterministic=True):

    stats = EpisodeStats(episode_lengths=np.zeros(episodes),
                         episode_rewards=np.zeros(episodes),
                         episode_loss=np.zeros(episodes))

    print_header(3, 'Testing')

    for e in range(episodes):

        s = env.reset(initial_state=initial_state,
                      noise_amplitude=initial_noise)

        for t in range(time_steps):

            if render:
                env.render()

            a = agent.get_action(s, deterministic=deterministic)
            s, r, d, _ = env.step(tn(a))

            stats.episode_rewards[e] += r
            stats.episode_lengths[e] = t

            if d:
                break

        pr_stats = {
            'run': run,
            'steps': int(stats.episode_lengths[e] + 1),
            'episode': e + 1,
            'episodes': episodes,
            'reward': stats.episode_rewards[e]
        }
        print_stats(pr_stats)

    if render:
        env.viewer.close()

    return stats
Example #3
	def train(self, env, episodes, time_steps, initial_state=None, initial_noise=0.5):

		stats = EpisodeStats(episode_lengths=np.zeros(episodes), episode_rewards=np.zeros(episodes),
							 episode_loss=np.zeros(episodes))

		self._run += 1

		for e in range(episodes):

			s = env.reset(initial_state=initial_state, noise_amplitude=initial_noise)

			for t in range(time_steps):

				a = self._actor.get_action(s, deterministic=False)
				ns, r, d, _ = env.step(tn(a))

				stats.episode_rewards[e] += r
				stats.episode_lengths[e] = t

				self._steps += 1
				self._replay_buffer.add_transition(s, a, ns, r, d)

				# Sample replay buffer
				b_states, b_actions, b_nstates, b_rewards, b_terminal = self._replay_buffer.random_next_batch(self._batch_size)

				# Get action according to target actor policy
				b_nactions = self._actor_target.get_action(b_nstates, deterministic=False)

				# Compute the target Q value from target critic
				target_Q1, target_Q2 = self._critic_target(b_nstates, b_nactions)
				target_Q = torch.min(target_Q1, target_Q2).reshape((-1))
				target_Q = b_rewards + (1 - b_terminal) * self._gamma * target_Q
				target_Q = target_Q.reshape((-1, 1)).detach()

				# Get current Q estimates from critic
				current_Q1, current_Q2 = self._critic(b_states, b_actions)

				# Compute critic loss
				critic_loss = self._critic_loss(current_Q1, target_Q) + self._critic_loss(current_Q2, target_Q)

				stats.episode_loss[e] += critic_loss.item()

				# Optimize the critic
				self._critic_optimizer.zero_grad()
				critic_loss.backward()
				self._critic_optimizer.step()

				# Delayed policy updates
				if self._steps % self._policy_freq == 0:

					# Compute actor losses by the deterministic policy gradient
					actor_loss = -self._critic.Q1(b_states, self._actor.get_action(b_states, deterministic=True)).mean()

					# Optimize the actor
					self._actor_optimizer.zero_grad()
					actor_loss.backward()
					self._actor_optimizer.step()

					# Soft-Update the target models
					soft_update(self._critic_target, self._critic, self._tau)
					soft_update(self._actor_target, self._actor, self._tau)

				if d:
					break
				s = ns

			pr_stats = {'run': self._run, 'steps': int(stats.episode_lengths[e] + 1),
						'episode': e + 1, 'episodes': episodes,
						'reward': stats.episode_rewards[e], 'loss': stats.episode_loss[e]}
			print_stats(pr_stats)

		return stats
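
The TD3 loop above calls soft_update(target, source, tau) without showing it. A minimal sketch of the usual Polyak-averaging helper, assuming both arguments are torch.nn.Module instances (the project's own implementation may differ):

import torch

def soft_update(target, source, tau):
    # Polyak averaging: target <- tau * source + (1 - tau) * target,
    # applied parameter-wise without tracking gradients.
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.data.mul_(1.0 - tau)
            t_param.data.add_(tau * s_param.data)
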
Example #4
    def train(self,
              env,
              episodes,
              time_steps,
              initial_state=None,
              initial_noise=0.5):

        stats = EpisodeStats(episode_lengths=np.zeros(episodes),
                             episode_rewards=np.zeros(episodes),
                             episode_loss=np.zeros(episodes))

        self._run += 1

        for e in range(episodes):
            # Generate an episode.
            # An episode is an array of (state, action, reward) tuples
            episode = []
            s = env.reset(initial_state=initial_state,
                          noise_amplitude=initial_noise)

            total_r = 0
            for t in range(time_steps):
                a = self._get_action(s)
                ns, r, d, _ = env.step(tn(self._action_fun.act2env(a)))

                stats.episode_rewards[e] += r
                stats.episode_lengths[e] = t

                episode.append((s, a, r))

                total_r += r

                if d:
                    break
                s = ns

            for t in range(len(episode)):
                # Unpack the transition at step t
                s, a, r = episode[t]

                # Monte-Carlo return G_t = sum_{k >= t} gamma^(k-t) * r_k
                g = 0
                gamma_kt = 1
                for k in range(t, len(episode)):
                    _, _, r_k = episode[k]
                    g = g + (gamma_kt * r_k)
                    gamma_kt = gamma_kt * self._gamma

                g = float(g)

                p = self._pi(s, a)

                # For numerical stability: clamp so the probability is never
                # larger than one (e.g. a delta distribution) and never exactly
                # zero, which would break the log in the score function.
                eps = 1e-8
                p = p.clamp(eps, 1)

                log_p = torch.log(p)

                # Discount factor gamma^t for the REINFORCE update
                gamma_t = self._gamma ** t

                if self._baseline:
                    # Detach the baseline value so the policy update does not
                    # backpropagate into the baseline network.
                    bl = self.baseline_fun(s).detach()
                    delta = g - bl

                    bl_loss = self._bl_loss_function(self.baseline_fun(s),
                                                     tt([g]))

                    self._bl_optimizer.zero_grad()
                    bl_loss.backward()
                    self._bl_optimizer.step()

                    score_fun = torch.mean(-(gamma_t * delta) * log_p)
                else:
                    score_fun = torch.mean(-(gamma_t * g) * log_p)

                stats.episode_loss[e] += score_fun.item()

                self._pi_optimizer.zero_grad()
                score_fun.backward()
                self._pi_optimizer.step()

            pr_stats = {
                'run': self._run,
                'steps': int(stats.episode_lengths[e] + 1),
                'episode': e + 1,
                'episodes': episodes,
                'reward': stats.episode_rewards[e],
                'loss': stats.episode_loss[e]
            }
            print_stats(pr_stats)

        return stats
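
The nested loop in Example #4 recomputes the discounted return from scratch for every step, which costs O(T^2) per episode. A hypothetical helper (not part of the original code) that computes all per-step returns in a single backward pass:

def discounted_returns(rewards, gamma):
    # Accumulate G_t = r_t + gamma * G_{t+1} from the last step backwards,
    # giving every per-step return in O(T).
    returns = [0.0] * len(rewards)
    g = 0.0
    for t in reversed(range(len(rewards))):
        g = rewards[t] + gamma * g
        returns[t] = g
    return returns
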
Example #5
def clip_action(a):
    return np.clip(tn(a), -1 + 1e-8, 1 - 1e-8)
Example #6
def identity(a):
    return tn(a)
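
All of the examples convert actions with a tn(...) helper before handing them to NumPy or the environment. Its definition is not shown; a plausible sketch, assuming it turns a torch tensor into a NumPy array (the real helper may differ):

import torch

def tn(x):
    # Detach from the autograd graph, move to CPU and convert to NumPy;
    # pass anything that is not a tensor through unchanged.
    if isinstance(x, torch.Tensor):
        return x.detach().cpu().numpy()
    return x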