Example #1
    def train_step(self, rb: ReplayBuffer, sample_size=300):
        # loss calculation: one-step TD target vs. current Q(s, a)
        trans_sts = rb.sample(sample_size)
        states = torch.stack([trans.state_tensor
                              for trans in trans_sts]).to(self.device)
        next_states = torch.stack(
            [trans.next_state_tensor for trans in trans_sts]).to(self.device)
        not_done = torch.stack([trans.not_done_tensor
                                for trans in trans_sts]).to(self.device)
        actions = [trans.action for trans in trans_sts]
        rewards = torch.stack([trans.reward_tensor
                               for trans in trans_sts]).to(self.device)

        with torch.no_grad():
            qvals_predicted = self.tgt_model(next_states).max(-1)

        self.model.optimizer.zero_grad()
        qvals_current = self.model(states)
        one_hot_actions = torch.nn.functional.one_hot(
            torch.LongTensor(actions), self.num_actions).to(self.device)
        loss = ((rewards + (not_done * qvals_predicted.values) -
                 torch.sum(qvals_current * one_hot_actions, -1))**2).mean()
        loss.backward()
        self.model.optimizer.step()
        return loss.detach().item()
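
The train_step above assumes a ReplayBuffer whose transitions expose state_tensor, action, reward_tensor, next_state_tensor and not_done_tensor. A minimal sketch of such a buffer, with field names inferred from the usage above rather than taken from the original repository:

import random
from collections import deque, namedtuple

import torch

# Field names are assumptions inferred from how train_step reads each transition.
Transition = namedtuple(
    "Transition",
    ["state_tensor", "action", "reward_tensor", "next_state_tensor", "not_done_tensor"])


class ReplayBuffer:
    """Fixed-size FIFO buffer of Transition tuples (sketch)."""

    def __init__(self, max_size=100000):
        self.buffer = deque(maxlen=max_size)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append(Transition(
            state_tensor=torch.as_tensor(state, dtype=torch.float32),
            action=int(action),
            reward_tensor=torch.as_tensor(reward, dtype=torch.float32),
            next_state_tensor=torch.as_tensor(next_state, dtype=torch.float32),
            not_done_tensor=torch.as_tensor(1.0 - float(done))))

    def sample(self, sample_size):
        return random.sample(list(self.buffer), min(sample_size, len(self.buffer)))

    def __len__(self):
        return len(self.buffer)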
Example #2
    def train(self, replay_buffer: ReplayBuffer):
        """Train the Agent"""

        self.total_it += 1

        # Sample replay buffer
        state, action, reward, next_state, done = replay_buffer.sample()  # sample 256 experiences

        with torch.no_grad():
            # Select action according to policy and add clipped noise
            noise = (torch.randn_like(action) * self.policy_noise).clamp(
                -self.noise_clip, self.noise_clip)

            next_action = (
                self.actor_target(next_state) +
                noise  # target-policy smoothing: noise is added only during training to curb overestimation
            ).clamp(-self.max_action, self.max_action)

            # Compute the target Q value
            target_Q1, target_Q2 = self.critic_target(next_state,
                                                      next_action)  #Q1, Q2
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward + (1 -
                                 done) * self.discount * target_Q  #TD-target

        # Get current Q estimates
        current_Q1, current_Q2 = self.critic(state, action)  #Q1, Q2

        # Compute critic loss using MSE
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(
            current_Q2, target_Q)

        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Delayed policy updates (DDPG baseline = 1)
        if (self.total_it % self.policy_freq == 0):

            # Compute actor loss
            actor_loss = -self.critic(state, self.actor(state))[0].mean()

            # Optimize the actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Soft update by updating the frozen target models
            for param, target_param in zip(self.critic.parameters(),
                                           self.critic_target.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

            for param, target_param in zip(self.actor.parameters(),
                                           self.actor_target.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)
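
A sketch of the outer loop that typically drives a TD3 train() method like the one above. The Gym-style environment API, agent.select_action, the buffer's add signature and the exploration-noise scale are all assumptions; only train(replay_buffer) comes from the code above.

import numpy as np

def run_training(env, agent, replay_buffer, max_steps=100000, start_steps=1000,
                 expl_noise=0.1, max_action=1.0):
    state, done = env.reset(), False
    for t in range(max_steps):
        if t < start_steps:
            action = env.action_space.sample()  # warm-up with random actions
        else:
            # exploration noise is added only when acting, not inside train()
            action = agent.select_action(np.array(state))
            action = np.clip(
                action + np.random.normal(0, expl_noise * max_action, size=action.shape),
                -max_action, max_action)
        next_state, reward, done, _ = env.step(action)
        replay_buffer.add(state, action, reward, next_state, float(done))
        state = next_state
        if t >= start_steps:
            agent.train(replay_buffer)  # one gradient step per environment step
        if done:
            state, done = env.reset(), False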
Example #3
class DDPG:
    def __init__(self, action_dim, action_bound, tau, lr_a, lr_c, state_dim,
                 gamma, batch_size):
        self.target = tf.placeholder(tf.float32, [None, 1], 'critic_target')
        self.s = tf.placeholder(tf.float32, [None, state_dim], 'state')
        self.s_ = tf.placeholder(tf.float32, [None, state_dim], 'next_state')

        self.memory = ReplayBuffer(max_size=10000)
        self.noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_dim))
        self.batch_size = batch_size
        self.gamma = gamma

        self.sess = tf.Session()

        self.actor = Actor(self.sess,
                           self.s,
                           self.s_,
                           action_dim,
                           action_bound,
                           tau,
                           lr_a,
                           f1_units=300)
        self.critic = Critic(self.sess,
                             lr_c,
                             self.s,
                             self.s_,
                             self.actor.a,
                             self.actor.a_,
                             self.target,
                             tau,
                             gamma,
                             state_dim,
                             action_dim,
                             f1_units=300)
        self.actor.add_grad_to_graph(self.critic.a_g)

        self.sess.run(tf.global_variables_initializer())

    def choose_action(self, s):
        a = self.actor.choose_action(s)
        var = self.noise()
        a = a + var
        return a[0]

    def update_target_networks(self):
        self.sess.run([self.actor.replace, self.critic.replace])

    def store(self, s, a, r, s_, done):
        self.memory.store(s, a, r, s_, done)

    def learn(self):
        bs, ba, br, bs_, bdone = self.memory.sample(self.batch_size)

        q_ = self.sess.run(self.critic.q_, {self.s_: bs_})
        br = br[:, np.newaxis]
        # Mask the bootstrap term so terminal transitions use only the immediate reward
        not_done = 1.0 - np.asarray(bdone, dtype=np.float32)[:, np.newaxis]
        target_critic = br + self.gamma * not_done * q_
        self.critic.learn(bs, ba, target_critic)
        self.actor.learn(bs)
        self.update_target_networks()
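
choose_action above adds OrnsteinUhlenbeckActionNoise that is not shown here. A minimal sketch of such a process; the parameter defaults are illustrative assumptions:

import numpy as np

class OrnsteinUhlenbeckActionNoise:
    """Temporally correlated noise: dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)."""

    def __init__(self, mu, sigma=0.2, theta=0.15, dt=1e-2, x0=None):
        self.mu, self.sigma, self.theta, self.dt, self.x0 = mu, sigma, theta, dt, x0
        self.reset()

    def __call__(self):
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape))
        self.x_prev = x
        return x

    def reset(self):
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)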
Example #4
class Simple_DQNAgent(Agent):
	"""
	This agent can handle the networks ConvDQN and LinearDQN. This agent uses a single DQN and a replay buffer for learning.
	"""
	def __init__(self, env, network, learning_rate, gamma, eps_max, eps_min, eps_dec, buffer_size):
		super().__init__(env, network, learning_rate, gamma, eps_max, eps_min, eps_dec)

		if self.network == "SimpleConvDQN":
			self.model = ConvDQN(env.env_shape, env.no_of_actions)
		elif self.network == "LinearDQN":
			self.model = LinearDQN(env.env_shape, env.no_of_actions)

		self.replay_buffer = ReplayBuffer(max_size=buffer_size, input_shape = env.env_shape)

	def get_action(self, state):

		if np.random.random() <= self.eps:  # epsilon-greedy: explore with probability eps
			return self.env.sample_action()

		else:
			state = T.tensor(state, dtype=T.float).unsqueeze(0).to(self.model.device)
			actions = self.model.forward(state)
			return T.argmax(actions).item()

	def update(self, batch_size):
		self.model.optimizer.zero_grad()

		batch = self.replay_buffer.sample(batch_size)
		states, actions, rewards, next_states, dones = batch
		states_t = T.tensor(states, dtype=T.float).to(self.model.device)
		actions_t = T.tensor(actions).to(self.model.device)
		rewards_t = T.tensor(rewards, dtype=T.float).to(self.model.device)
		next_states_t = T.tensor(next_states, dtype=T.float).to(self.model.device)
		dones_t = T.tensor(dones, dtype=T.float).to(self.model.device)

		curr_Q = self.model.forward(states_t).gather(1, actions_t.unsqueeze(1))
		curr_Q = curr_Q.squeeze(1)
		# Detach the bootstrap value so no gradients flow through the target estimate
		next_Q = self.model.forward(next_states_t).detach()
		max_next_Q = T.max(next_Q, 1)[0]
		# Zero the bootstrap term for terminal transitions
		expected_Q = rewards_t + self.gamma * max_next_Q * (1 - dones_t)


		loss = self.model.MSE_loss(curr_Q, expected_Q).to(self.model.device)

		loss.backward()
		self.model.optimizer.step()

		self.dec_eps()

	def learn(self,state, action, reward, next_state, done, batch_size):
		self.replay_buffer.store_transition(state, action, reward, next_state, done)

		if len(self.replay_buffer) > batch_size:
			self.update(batch_size)
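
Simple_DQNAgent expects a ReplayBuffer built from max_size and input_shape that offers store_transition, sample and __len__. A minimal array-backed sketch under those assumptions (input_shape is assumed to be a tuple):

import numpy as np

class ReplayBuffer:
    """Ring buffer of (s, a, r, s', done) transitions in preallocated numpy arrays (sketch)."""

    def __init__(self, max_size, input_shape):
        self.mem_size = max_size
        self.mem_cntr = 0
        self.state_memory = np.zeros((max_size, *input_shape), dtype=np.float32)
        self.next_state_memory = np.zeros((max_size, *input_shape), dtype=np.float32)
        self.action_memory = np.zeros(max_size, dtype=np.int64)
        self.reward_memory = np.zeros(max_size, dtype=np.float32)
        self.terminal_memory = np.zeros(max_size, dtype=np.float32)

    def store_transition(self, state, action, reward, next_state, done):
        idx = self.mem_cntr % self.mem_size  # overwrite the oldest entry when full
        self.state_memory[idx] = state
        self.action_memory[idx] = action
        self.reward_memory[idx] = reward
        self.next_state_memory[idx] = next_state
        self.terminal_memory[idx] = float(done)
        self.mem_cntr += 1

    def sample(self, batch_size):
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, batch_size, replace=False)
        return (self.state_memory[batch], self.action_memory[batch],
                self.reward_memory[batch], self.next_state_memory[batch],
                self.terminal_memory[batch])

    def __len__(self):
        return min(self.mem_cntr, self.mem_size)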
Example #5
    def generator(self):
        self.policy.init()
        time_step = self.env.reset()
        episode_reward = 0
        episode_time_steps = 0
        episode_num = 0

        state_shape = self.env.observation_spec().shape
        action_dim = self.env.action_spec().shape[0]
        replay_buffer = ReplayBuffer(state_shape, action_dim, max_size=self.buffer_size)
        next(self.rng)

        for t in (
            range(int(self.max_time_steps))
            if self.max_time_steps
            else itertools.count()
        ):
            episode_time_steps += 1
            state = time_step.observation

            # Select action randomly or according to policy
            action = yield state

            # Perform action
            time_step = self.env.step(action)
            done_bool = float(time_step.last())

            # Store data in replay buffer
            replay_buffer.add(
                state, action, time_step.observation, time_step.reward, done_bool
            )

            episode_reward += time_step.reward

            # Train agent after collecting sufficient data
            if t >= self.start_time_steps:
                for _ in range(self.train_steps):
                    data = replay_buffer.sample(next(self.rng), self.batch_size)
                    self.policy.update(**vars(data))

            if time_step.last():
                # +1 to account for 0-indexing; episode_time_steps needs no +1 because it was already incremented this step
                self.report(
                    time_steps=t + 1,
                    episode=episode_num + 1,
                    episode_time_steps=episode_time_steps,
                    reward=episode_reward,
                )
                # Reset environment
                time_step = self.env.reset()
                episode_reward = 0
                episode_time_steps = 0
                episode_num += 1
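
generator() is a coroutine: it yields the current state and receives the next action through send(). A usage sketch, under the assumption that the surrounding trainer object exposes generator() and a policy with a select_action method:

loop = trainer.generator()
state = next(loop)  # prime the coroutine; runs up to the first `yield state`
while True:
    action = trainer.policy.select_action(state)  # or a random action during warm-up
    try:
        # send() resumes after the yield: the env is stepped, the transition stored,
        # training runs if enough data was collected, and the next state is yielded back
        state = loop.send(action)
    except StopIteration:
        break  # raised once max_time_steps iterations have completed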
Example #6
class MADDPG():
    def __init__(self,
                 num_agents=2,
                 state_size=24,
                 action_size=2,
                 random_seed=2):
        self.num_agents = num_agents
        self.agents = [
            Agent(state_size, action_size, random_seed)
            for i in range(self.num_agents)
        ]
        self.memory = ReplayBuffer(action_size,
                                   buffer_size=BUFFER_SIZE,
                                   batch_size=MINI_BATCH,
                                   seed=random_seed)

    def act(self, states, add_noise=True):
        actions = []
        for state, agent in zip(states, self.agents):
            action = agent.act(state, add_noise)
            actions.append(action)
        return actions

    def reset(self):
        for agent in self.agents:
            agent.reset()

    def step(self, states, actions, rewards, next_states, dones):

        for i in range(self.num_agents):
            self.memory.add(states[i], actions[i], rewards[i], next_states[i],
                            dones[i])

        if (len(self.memory) > MINI_BATCH):
            for _ in range(self.num_agents):
                experience = self.memory.sample()
                self.learn(experience)

    def learn(self, experiences, gamma=GAMMA):
        for agent in self.agents:
            agent.learn(experiences, gamma)
def play_and_train(env,
                   agent,
                   t_max=10**4,
                   replay_buffer: ReplayBuffer = None,
                   replay_batch_size: int = None):
    """
    This function should
    - run a full game, actions given by agent's e-greedy policy
    - train agent using agent.update(...) whenever it is possible
    - return total reward
    """
    total_reward = 0.0
    s = env.reset()

    for t in range(t_max):
        # get agent to pick action given state s.
        a = agent.get_action(s)

        next_s, r, done, _ = env.step(a)

        # train (update) agent for state s
        agent.update(s, a, r, next_s)

        if replay_buffer is not None:
            # store current <s,a,r,s'> transition in buffer
            replay_buffer.add(s, a, r, next_s, done)

            # sample replay_batch_size random transitions from replay,
            # then update agent on each of them in a loop
            s_, a_, r_, next_s_, done_ = replay_buffer.sample(
                replay_batch_size)
            for i in range(replay_batch_size):
                agent.update(s_[i], a_[i], r_[i], next_s_[i])

        s = next_s
        total_reward += r
        if done:
            break

    return total_reward
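
play_and_train assumes an agent whose update(s, a, r, s') performs a one-step TD backup. A minimal tabular epsilon-greedy Q-learning agent with that interface (alpha, epsilon and the Q-table layout are assumptions; states must be hashable):

import numpy as np
from collections import defaultdict

class QLearningAgent:
    """Minimal tabular epsilon-greedy Q-learning agent (sketch)."""

    def __init__(self, n_actions, alpha=0.5, epsilon=0.1, gamma=0.99):
        self.n_actions, self.alpha, self.epsilon, self.gamma = n_actions, alpha, epsilon, gamma
        self.q = defaultdict(lambda: np.zeros(n_actions))  # Q-table keyed by (hashable) state

    def get_action(self, s):
        if np.random.random() < self.epsilon:
            return np.random.randint(self.n_actions)
        return int(np.argmax(self.q[s]))

    def update(self, s, a, r, next_s):
        # Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
        td_target = r + self.gamma * np.max(self.q[next_s])
        self.q[s][a] += self.alpha * (td_target - self.q[s][a])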
Example #8
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.name = "DDPG"
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high,
                                 'actor_local')
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high,
                                  'actor_target')

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size,
                                   'critic_local')
        self.critic_target = Critic(self.state_size, self.action_size,
                                    'critic_target')

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0.0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters

        # Reward counter
        self.total_reward = 0
        self.n_steps = 0

    def load(self):
        self.actor_local.load()
        self.actor_target.load()
        self.critic_local.load()
        self.critic_target.load()
        print("Agent's weights loaded from disk.")

    def save(self):
        self.actor_local.save()
        self.actor_target.save()
        self.critic_local.save()
        self.critic_target.save()
        print("Agent's weights saved to disk.")

    def reset_episode(self):
        self.total_reward = 0
        self.n_steps = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)
        # Add reward to total
        self.total_reward += reward
        self.n_steps += 1

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state, add_noise=True):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        # Hack, rescale rotor revs to +-5 range from average
        # rev_mean = np.mean(action)
        # action = (action-450)/450
        # action *= 50
        # action += rev_mean

        if add_noise:
            action += self.noise.sample()  # additive noise for exploration
        return list(action)

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
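
learn() above iterates over experience namedtuples (e.state, e.action, ...) returned by ReplayBuffer(buffer_size, batch_size). A minimal buffer matching that interface; the field names follow the attribute accesses in learn() and are otherwise assumptions:

import random
from collections import deque, namedtuple

Experience = namedtuple("Experience", ["state", "action", "reward", "next_state", "done"])


class ReplayBuffer:
    """Fixed-size buffer that samples a list of Experience namedtuples (sketch)."""

    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self):
        return random.sample(list(self.memory), k=self.batch_size)

    def __len__(self):
        return len(self.memory)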
def q_learning_fun(env,
                   q_func,
                   optimizer_spec,
                   exploration,
                   replay_buffer_size=1000000,
                   batch_size=32,
                   gamma=0.99,
                   learning_starts=50000,
                   learning_freq=4,
                   max_learning_steps=1000000,
                   frame_history_len=4,
                   target_update_freq=10000):

    if not os.path.isdir("./dqn"):
        os.mkdir("./dqn")

    img_h, img_w, img_c = env.observation_space.shape
    input_arg = frame_history_len * img_c
    num_actions = env.action_space.n

    # Construct an epsilon-greedy policy with the given exploration schedule
    def select_epsilon_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            with torch.no_grad():
                ret = model(obs).data.max(1)[1].cpu()
                return ret
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    # Initialize target q function and q function
    Q = q_func(input_arg, num_actions).type(dtype)
    target_Q = q_func(input_arg, num_actions).type(dtype)

    # Construct Q network optimizer function
    optimizer = optimizer_spec.constructor(Q.parameters(),
                                           **optimizer_spec.kwargs)

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    num_param_updates = 0
    mean_episode_reward = float('nan')
    best_mean_episode_reward = -float('inf')
    save_best_mean_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000
    SAVE_EVERY_N_STEPS = 2000000

    for t in count():
        ### Check stopping criterion
        if env.get_total_steps() >= max_learning_steps:
            break

        last_idx = replay_buffer.store_frame(last_obs)
        recent_observations = replay_buffer.encode_recent_observation()

        # Act epsilon-greedily once learning has started; otherwise choose a random action
        if t > learning_starts:
            action = select_epsilon_greedy_action(Q, recent_observations, t)[0]
        else:
            action = random.randrange(num_actions)
        obs, reward, done, _ = env.step(action)
        # clip rewards between -1 and 1
        reward = max(-1.0, min(reward, 1.0))
        # Store other info in replay memory
        replay_buffer.store_effect(last_idx, action, reward, done)
        # Resets the environment when reaching an episode boundary.
        if done:
            obs = env.reset()
        last_obs = obs

        if (t > learning_starts and t % learning_freq == 0):
            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(
                batch_size)
            # Convert numpy nd_array to torch variables for calculation
            obs_batch = torch.from_numpy(obs_batch).type(dtype) / 255.0
            act_batch = torch.from_numpy(act_batch).long()
            rew_batch = torch.from_numpy(rew_batch)
            next_obs_batch = torch.from_numpy(next_obs_batch).type(
                dtype) / 255.0
            not_done_mask = torch.from_numpy(1 - done_mask).type(dtype)

            if USE_CUDA:
                act_batch = act_batch.cuda()
                rew_batch = rew_batch.cuda()

            # Q-learning update: TD target is r + gamma * max_a' target_Q(s', a')
            cur_all_Q_values = Q(obs_batch)
            cur_act_Q_values = cur_all_Q_values.gather(
                1, act_batch.unsqueeze(1)).squeeze()
            next_all_target_Q_values = target_Q(next_obs_batch).detach()
            next_max_target_Q_values = next_all_target_Q_values.max(1)[0]

            next_max_target_Q_values = not_done_mask * next_max_target_Q_values

            target = rew_batch + (gamma * next_max_target_Q_values)
            error = target - cur_act_Q_values

            clipped_error = error.clamp(-1, 1)

            d_error = clipped_error * -1.0

            optimizer.zero_grad()

            cur_act_Q_values.backward(d_error.data)

            optimizer.step()
            num_param_updates += 1

            # Periodically update the target network
            if num_param_updates % target_update_freq == 0:
                target_Q.load_state_dict(Q.state_dict())

        # Log progress and save statistics
        episode_rewards = env.get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward,
                                           mean_episode_reward)

        Statistic["mean_episode_rewards"].append(mean_episode_reward)
        Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward)

        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            print("Timestep %d" % (t, ))
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            sys.stdout.flush()

            # Dump statistics to pickle
            with open('./dqn/statistics.pkl', 'wb') as f:
                pickle.dump(Statistic, f)
                print("Saved to %s" % './dqn/statistics.pkl')

            if save_best_mean_reward < best_mean_episode_reward:
                save_best_mean_reward = best_mean_episode_reward
                torch.save(Q.state_dict(), './dqn/best_model.pth')

        if t % SAVE_EVERY_N_STEPS == 0:
            torch.save(Q.state_dict(), './dqn/n_steps_%d.pth' % t)
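
q_learning_fun relies on two small helpers defined elsewhere: an exploration schedule with a value(t) method and an optimizer_spec carrying a constructor plus keyword arguments. Minimal sketches matching those call sites (names and defaults are assumptions):

from collections import namedtuple

# Matches optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs) above,
# e.g. OptimizerSpec(constructor=torch.optim.RMSprop, kwargs=dict(lr=2.5e-4, alpha=0.95, eps=0.01))
OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs"])


class LinearSchedule:
    """Linear interpolation from initial_p to final_p over schedule_timesteps, then constant."""

    def __init__(self, schedule_timesteps, final_p=0.1, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)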
Example #10
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 lr_actor=LR_ACTOR,
                 lr_critic=LR_CRITIC,
                 random_seed=42,
                 num_agents=1):
        """Initialize Agent object.
        
        Params
        ====
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            lr_actor (float): Learning rate for actor model
            lr_critic (float): Learning Rate for critic model
            random_seed (int): Random seed
            num_agents (int): Number of agents
            
        return 
        ====
            None
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.num_agents = num_agents

        # Initialize time step (for updating every hyperparameters["update_every"] steps)
        self.t_step = 0

        # Actor network
        self.actor = ActorNetwork(lr_actor,
                                  state_size,
                                  action_size,
                                  random_seed,
                                  name="actor")
        self.actor_target = ActorNetwork(lr_actor,
                                         state_size,
                                         action_size,
                                         random_seed,
                                         name="actor_target")

        self.soft_update(self.actor, self.actor_target, tau=1)

        # Critic network
        self.critic = CriticNetwork(lr_critic,
                                    state_size,
                                    action_size,
                                    random_seed,
                                    name="critic")
        self.critic_target = CriticNetwork(lr_critic,
                                           state_size,
                                           action_size,
                                           random_seed,
                                           name="critic_target")

        self.soft_update(self.critic, self.critic_target, tau=1)

        # Noise process
        self.noise = OUActionNoise(mu=np.zeros(action_size))

        # Replay buffer memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        # Support for multiple agent learners
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)
        # Update timestep to learn
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE and self.t_step == 0:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        states = T.from_numpy(state).float().to(device)
        self.actor.eval()
        with T.no_grad():
            actions = self.actor(states).cpu().data.numpy()
        self.actor.train()

        if add_noise:
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic.optimizer.zero_grad()
        critic_loss.backward()
        T.nn.utils.clip_grad_norm_(self.critic.parameters(), 1.0)
        self.critic.optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor(states)
        actor_loss = -self.critic(states, actions_pred).mean()
        # Minimize the loss
        self.actor.optimizer.zero_grad()
        actor_loss.backward()
        self.actor.optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic, self.critic_target, TAU)
        self.soft_update(self.actor, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def save_models(self):
        """ Save models weights """
        self.actor.save_checkpoint()
        self.critic.save_checkpoint()
        self.actor_target.save_checkpoint()
        self.critic_target.save_checkpoint()

    def load_models(self):
        """ Load models weights """
        self.actor.load_checkpoint()
        self.critic.load_checkpoint()
        self.actor_target.load_checkpoint()
        self.critic_target.load_checkpoint()
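
The Agent above reads several module-level constants (BUFFER_SIZE, BATCH_SIZE, GAMMA, TAU, LR_ACTOR, LR_CRITIC, UPDATE_EVERY, device) defined elsewhere. A sketch with commonly used values; the original project's values may differ:

import torch as T

BUFFER_SIZE = int(1e6)   # replay buffer size
BATCH_SIZE = 128         # minibatch size
GAMMA = 0.99             # discount factor
TAU = 1e-3               # soft-update interpolation factor
LR_ACTOR = 1e-4          # actor learning rate
LR_CRITIC = 1e-3         # critic learning rate
UPDATE_EVERY = 20        # learn every N environment steps
device = T.device("cuda:0" if T.cuda.is_available() else "cpu")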
class AgentDDPG():
    def __init__(self, env):
        """

        :param task: (class instance) Instructions about the goal and reward
        """

        self.env = env
        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.shape[0]
        self.action_low = env.action_space.low
        self.action_high = env.action_space.high
        self.score = 0.0
        self.best = 0.0

        # Instances of the policy function (actor) and the value function (critic),
        # each with a local and a target network

        # Actor local and target
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)

        # Save actor model for future use
        actor_local_model_yaml = self.actor_local.model.to_yaml()
        with open("actor_local_model.yaml", "w") as yaml_file:
            yaml_file.write(actor_local_model_yaml)

        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic local and target
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model with local model
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Initialize the Ornstein-Uhlenbeck noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Initialize the Replay Memory
        self.buffer_size = 100000
        self.batch_size = 64  # original 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Parameters for the Algorithm
        self.gamma = 0.99  # Discount factor
        self.tau = 0.01  # Soft-update rate for the target network parameters

    # Actor can reset the episode
    def reset_episode(self):
        # Reset the running reward total and the step count
        self.total_reward = 0.0
        self.count = 0
        # Reset the gaussian noise
        self.noise.reset()
        # Gets a new state from the task
        state = self.env.reset()
        # Protect the state obtained from the task
        # by storing it as last state
        self.last_state = state
        # Return the state obtained from task
        return state

    # Actor interact with the environment
    def step(self, action, reward, next_state, done):
        # Add to the total reward the reward of this time step
        self.total_reward += reward
        # Increase your count based on the number of rewards
        # received in the episode
        self.count += 1
        # Store the previous state transition in the replay buffer
        self.memory.add(self.last_state, action, reward, next_state, done)
        # Check to see if you have enough to produce a batch
        # and learn from it
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            # Train the networks using the experiences
            self.learn(experiences)

        # Roll over the last state
        self.last_state = next_state

    # Actor determines what to do based on the policy
    def act(self, state):
        # Given a state return the action recommended by the policy
        # Reshape the state to fit the keras model input
        state = np.reshape(state, newshape=[-1, self.state_size])
        # Pass the state to the actor local model to get an action
        # recommend for the policy in a state
        action = self.actor_local.model.predict(state)[0]
        # Because we are exploring we add some noise to the
        # action vector
        return list(action + self.noise.sample())

    # Learning logic, called from step() once enough experience has been collected
    def learn(self, experiences):
        """
        Learning means that the networks parameters needs to be updated
        Using the experineces batch.
        Network learns from experiences not form interaction with the
        environment
        """

        # Reshape the experience tuples into separate arrays of states, actions,
        # rewards, next_states and dones, converting every member of the tuple
        # into a column vector
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # First we pass a batch of next states to the actor so it tells us which actions
        # to execute; we use the actor target network instead of the actor local network
        # to keep the learning targets stable
        actions_next = self.actor_target.model.predict_on_batch(next_states)

        # The critic evaluates the actions taken by the actor and produces the
        # Q(s, a) values of those actions. These (state, action) pairs come from the
        # ReplayBuffer, not from interacting with the environment.
        # Remember that the critic (value function) takes states and actions as input
        Q_targets_next = self.critic_target.model.predict_on_batch(
            ([next_states, actions_next]))

        # Q_targets_next is a vector of action values Q(s', a') for the randomly selected
        # next_states from the replay buffer. From it we compute the target Q(s, a)
        # using the one-step TD (Sarsa-style) equation: terminal states contribute only
        # the reward, non-terminal states also add the discounted Q_targets_next value.
        # This lets us train the critic in a supervised-learning fashion.
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train the actor
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # Custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights)

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)

    def get_episode_score(self):
        """
        Calculate the episode scores
        :return: None
        """
        # Update score and best score
        self.score = self.total_reward / float(
            self.count) if self.count else 0.0
        if self.score > self.best:
            self.best = self.score

    def save_model_weights(self, actor_model):
        actor_model.model.save_weights('weights.h5')
Example #12
class DuelingDQNAgent(Agent):
	def __init__(self, env, network, learning_rate, gamma, eps_max, eps_min, eps_dec, buffer_size, replace_cnt):
		super().__init__(env, network, learning_rate, gamma, eps_max, eps_min, eps_dec)

		self.replay_buffer = ReplayBuffer(max_size=buffer_size, input_shape = env.env_shape)

		self.learn_step_counter = 0
		self.replace_cnt = replace_cnt
		self.q_eval = DuelingDQN(env.env_shape, env.no_of_actions)
		self.q_target = DuelingDQN(env.env_shape, env.no_of_actions)


	def get_action(self, state):

		if np.random.random() <= self.eps:  # epsilon-greedy: explore with probability eps
			return self.env.sample_action()

		else:
			state = T.tensor(state, dtype=T.float).unsqueeze(0).to(self.q_eval.device)
			_, advantage = self.q_eval.forward(state)
			return T.argmax(advantage).item()

	def replace_target_network(self):
		if self.learn_step_counter % self.replace_cnt == 0:
			self.q_target.load_state_dict(self.q_eval.state_dict())

	def get_batch_tensors(self, batch_size):
		batch = self.replay_buffer.sample(batch_size)
		states, actions, rewards, next_states, dones = batch
		states_t = T.tensor(states, dtype=T.float).to(self.q_eval.device)
		actions_t = T.tensor(actions).to(self.q_eval.device)
		rewards_t = T.tensor(rewards, dtype=T.float).to(self.q_eval.device)
		next_states_t = T.tensor(next_states, dtype=T.float).to(self.q_eval.device)
		dones_t = T.tensor(dones, dtype=T.float).to(self.q_eval.device)
		return states_t, actions_t, rewards_t, next_states_t, dones_t

	def update(self, batch_size):

		states_t, actions_t, rewards_t, next_states_t, dones_t = self.get_batch_tensors(batch_size)
		self.q_eval.optimizer.zero_grad()

		self.replace_target_network()

		indices = np.arange(batch_size)
		Vs, As = self.q_eval.forward(states_t)
		curr_Q = T.add(Vs, (As - As.mean(dim=1, keepdim=True)))[indices, actions_t]

		Vns, Ans = self.q_target.forward(next_states_t)
		# Use the next-state value/advantage streams from the target network, detached from the graph
		max_next_Q = T.add(Vns, (Ans - Ans.mean(dim=1, keepdim=True))).max(1)[0].detach()

		expected_Q = rewards_t + self.gamma * max_next_Q * (1 - dones_t)  # zero the bootstrap term at terminal states

		loss = self.q_eval.MSE_loss(curr_Q, expected_Q).to(self.q_eval.device)

		loss.backward()
		self.q_eval.optimizer.step()
		self.learn_step_counter += 1

		self.dec_eps()

	def learn(self,state, action, reward, next_state, done, batch_size):
		self.replay_buffer.store_transition(state, action, reward, next_state, done)

		if len(self.replay_buffer) > batch_size:
			self.update(batch_size)
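
DuelingDQNAgent assumes a DuelingDQN module that returns separate value and advantage streams and carries its own device, optimizer and MSE_loss attributes. A minimal fully connected sketch under those assumptions (layer sizes and learning rate are illustrative):

import numpy as np
import torch as T
import torch.nn as nn
import torch.optim as optim

class DuelingDQN(nn.Module):
    """Returns (V(s), A(s, .)) as two heads, as the agent above expects (sketch)."""

    def __init__(self, input_shape, n_actions, lr=1e-4):
        super().__init__()
        self.features = nn.Sequential(
            nn.Linear(int(np.prod(input_shape)), 256), nn.ReLU(),
            nn.Linear(256, 256), nn.ReLU())
        self.value_stream = nn.Linear(256, 1)               # V(s)
        self.advantage_stream = nn.Linear(256, n_actions)   # A(s, a)
        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.MSE_loss = nn.MSELoss()
        self.device = T.device("cuda:0" if T.cuda.is_available() else "cpu")
        self.to(self.device)

    def forward(self, state):
        x = self.features(state.view(state.size(0), -1))
        return self.value_stream(x), self.advantage_stream(x)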
Example #13
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters

        # Variables to store best score and scores
        self.best_score = -np.inf
        self.score_list = []

    def reset_episode(self):
        self.total_reward = 0.0
        self.count = 0

        self.noise.reset()
        state = self.task.reset()
        self.last_state = state

        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state
        self.last_state = next_state

        # Track rewards
        self.total_reward += reward
        self.count += 1
        if done:
            # Average total reward by step counts
            self.score = self.total_reward / float(
                self.count) if self.count else 0.0
            # Store the score and update the best score
            self.score_list.append(self.score)
            if self.score > self.best_score:
                self.best_score = self.score

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
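
A usage sketch of the episode loop that drives these task-based DDPG agents. The task.step(action) -> (next_state, reward, done) signature and the episode/step counts are assumptions:

agent = DDPG(task)
for i_episode in range(1, 501):
    state = agent.reset_episode()
    for _ in range(1000):
        action = agent.act(state)
        next_state, reward, done = task.step(action)  # assumed task API
        agent.step(action, reward, next_state, done)
        state = next_state
        if done:
            break
    print("Episode {:4d}  total reward: {:8.3f}  best avg score: {:8.3f}".format(
        i_episode, agent.total_reward, agent.best_score))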
Example #14
def train(target_vars, saver, sess, logger, resume_iter, env):
    tot_iter = int(FLAGS.nsteps // FLAGS.num_env)

    X = target_vars['X']
    X_NOISE = target_vars['X_NOISE']
    train_op = target_vars['train_op']
    loss_ml = target_vars['loss_ml']
    x_grad = target_vars['x_grad']
    x_mod = target_vars['x_mod']
    action_grad = target_vars['action_grad']
    X_START = target_vars['X_START']
    X_END = target_vars['X_END']
    X_PLAN = target_vars['X_PLAN']
    ACTION_PLAN = target_vars['ACTION_PLAN']
    ACTION_LABEL = target_vars['ACTION_LABEL']
    ACTION_NOISE = target_vars['ACTION_NOISE_LABEL']
    x_joint = target_vars['x_joint']
    actions = target_vars['actions']
    energy_pos = target_vars['energy_pos']
    energy_neg = target_vars['energy_neg']
    loss_total = target_vars['loss_total']
    dyn_loss = target_vars['dyn_loss']
    dyn_dist = target_vars['dyn_dist']

    ob = env.reset()[:, None, None, :]

    output = [train_op, x_mod]
    log_output = [
        train_op, dyn_loss, dyn_dist, energy_pos, energy_neg, loss_ml,
        loss_total, x_grad, action_grad, x_mod
    ]

    print(log_output)
    replay_buffer = ReplayBuffer(1000000)
    pos_replay_buffer = ReplayBuffer(1000000)

    epinfos = []
    points = []
    total_obs = []
    for itr in range(resume_iter, tot_iter):
        x_plan = np.random.uniform(
            -1.0, 1.0, (FLAGS.num_env, FLAGS.plan_steps, 1, FLAGS.latent_dim))
        action_plan = np.random.uniform(-1, 1,
                                        (FLAGS.num_env, FLAGS.plan_steps, 2))
        if FLAGS.datasource == "maze":
            x_end = np.tile(np.array([[0.7, -0.8]]),
                            (FLAGS.num_env, 1))[:, None, None, :]
        elif FLAGS.datasource == "reacher":
            x_end = np.tile(np.array([[0.7, 0.5]]),
                            (FLAGS.num_env, 1))[:, None, None, :]
        else:
            x_end = np.tile(np.array([[0.5, 0.5]]),
                            (FLAGS.num_env, 1))[:, None, None, :]

        x_traj, traj_actions = sess.run([x_joint, actions], {
            X_START: ob,
            X_PLAN: x_plan,
            X_END: x_end,
            ACTION_PLAN: action_plan
        })

        # Add some amount of exploration into predicted actions
        # traj_actions = traj_actions + np.random.uniform(-0.1, 0.1, traj_actions.shape)
        # traj_actions = np.clip(traj_actions, -1, 1)

        if FLAGS.debug:
            print(x_traj[0])

        obs = [ob[:, 0, 0, :]]
        dones = []
        diffs = []
        for i in range(traj_actions.shape[1] - 1):
            if FLAGS.random_action:
                action = np.random.uniform(-1, 1, traj_actions[:, i].shape)
            else:
                action = traj_actions[:, i]

            ob, _, done, infos = env.step(action)

            if i == 0:
                print(x_traj[0, 0], x_traj[0, 1], ob[0])
                target_ob = x_traj[:, i + 1]
                print("Abs dist: ", np.mean(np.abs(ob - target_ob)))

            dones.append(done)
            obs.append(ob)

            for info in infos:
                maybeepinfo = info.get('episode')
                if maybeepinfo: epinfos.append(maybeepinfo)

            diffs.append(np.abs(x_traj[:, i + 1] - ob).mean())

        ob = ob[:, None, None, :]
        dones = np.array(dones).transpose()
        obs = np.stack(obs, axis=1)[:, :, None, :]

        if FLAGS.heatmap:
            total_obs.append(obs.reshape((-1, FLAGS.latent_dim)))

        action, ob_pair = parse_valid_obs(obs, traj_actions, dones)

        # x_noise = np.stack([x_traj[:, :-1], x_traj[:, 1:]], axis=2)
        x_noise = np.stack([x_traj[:, :10], x_traj[:, 1:11]], axis=2)
        s = x_noise.shape
        x_noise_neg = x_noise.reshape((s[0] * s[1], s[2], s[3], s[4]))
        action_noise_neg = traj_actions[:, :-1]
        s = action_noise_neg.shape
        action_noise_neg = action_noise_neg.reshape((s[0] * s[1], s[2]))

        traj_action_encode = action.reshape((-1, 1, 1, FLAGS.action_dim))
        encode_data = np.concatenate([
            ob_pair,
            np.tile(traj_action_encode, (1, FLAGS.total_frame, 1, 1))
        ],
                                     axis=3)
        pos_replay_buffer.add(encode_data)

        if len(pos_replay_buffer
               ) > FLAGS.num_env * FLAGS.plan_steps and FLAGS.replay_batch:
            sample_data = pos_replay_buffer.sample(FLAGS.num_env *
                                                   FLAGS.plan_steps)
            sample_ob = sample_data[:, :, :, :-FLAGS.action_dim]
            sample_actions = sample_data[:, 0, 0, -FLAGS.action_dim:]

            ob_pair = np.concatenate([ob_pair, sample_ob], axis=0)
            action = np.concatenate([action, sample_actions], axis=0)

        feed_dict = {
            X: ob_pair,
            X_NOISE: x_noise_neg,
            ACTION_NOISE: action_noise_neg,
            ACTION_LABEL: action
        }

        batch_size = x_noise_neg.shape[0]
        if FLAGS.replay_batch and len(
                replay_buffer) > batch_size and not FLAGS.ff_model:
            replay_batch = replay_buffer.sample(int(batch_size / 2.))
            # replay_mask = (np.random.uniform(0, 1, (batch_size)) > 0.95)
            # feed_dict[X_NOISE][replay_mask] = replay_batch[replay_mask]
            feed_dict[X_NOISE] = np.concatenate(
                [feed_dict[X_NOISE], replay_batch], axis=0)

        if itr % FLAGS.log_interval == 0:
            _, dyn_loss, dyn_dist, e_pos, e_neg, loss_ml, loss_total, x_grad, action_grad, x_mod = sess.run(
                log_output, feed_dict=feed_dict)
            kvs = {}
            kvs['e_pos'] = e_pos.mean()
            kvs['e_neg'] = e_neg.mean()
            kvs['loss_ml'] = loss_ml.mean()
            kvs['loss_total'] = loss_total.mean()
            kvs['x_grad'] = np.abs(x_grad).mean()
            kvs['action_grad'] = np.abs(action_grad).mean()
            kvs['dyn_loss'] = dyn_loss.mean()
            kvs['dyn_dist'] = np.abs(dyn_dist).mean()
            kvs['iter'] = itr
            kvs["train_episode_length_mean"] = safemean(
                [epinfo['l'] for epinfo in epinfos])
            kvs["diffs"] = diffs[-1]

            epinfos = []

            string = "Obtained a total of "
            for key, value in kvs.items():
                string += "{}: {}, ".format(key, value)

            print(string)
            logger.writekvs(kvs)
        else:
            _, x_mod = sess.run(output, feed_dict=feed_dict)

        if FLAGS.replay_batch and (x_mod is not None):
            replay_buffer.add(x_mod)
            replay_buffer.add(ob_pair)

        if itr % FLAGS.save_interval == 0:
            saver.save(
                sess, osp.join(FLAGS.logdir, FLAGS.exp,
                               'model_{}'.format(itr)))

        if FLAGS.heatmap and itr == 100:
            total_obs = np.concatenate(total_obs, axis=0)
            # total_obs = total_obs[np.random.permutation(total_obs.shape[0])[:1000000]]
            sns.kdeplot(data=total_obs[:, 0],
                        data2=total_obs[:, 1],
                        shade=True)
            plt.savefig("kde.png")
            assert False
Example #15
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 seed=0,
                 double_dqn=False,
                 dueling=False,
                 per=False,
                 per_args=(0.2, 0.01, 2e-5)):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            double_dqn (bool): whether to implement Double DQN (default=False)
            dueling (bool): whether to implement Dueling DQN
            per (bool): whether to implement Prioritized Experience Replay
            per_args (tuple): a,beta,beta_increment for PER
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.double_dqn = double_dqn
        self.per = per
        self.gamma = GAMMA

        # output name for checkpoint
        self.output_name = ''
        self.output_name += '_double' if double_dqn else ''
        self.output_name += '_dueling' if dueling else ''
        self.output_name += '_per' if per else ''

        # Q-Network
        self.qnetwork_local = QNetwork(state_size,
                                       action_size,
                                       seed,
                                       dueling=dueling).to(device)
        self.qnetwork_target = QNetwork(state_size,
                                        action_size,
                                        seed,
                                        dueling=dueling).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        if self.per:
            self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE,
                                                  BATCH_SIZE, seed, *per_args)
        else:
            self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                       seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def train(self,
              env,
              n_episodes=1000,
              max_t=1000,
              eps_start=1.0,
              eps_end=0.01,
              eps_decay=0.995):
        """Deep Q-Learning.

        Params
        ======
            env (UnityEnvironment): Bananas environment
            n_episodes (int): maximum number of training episodes
            max_t (int): maximum number of timesteps per episode
            eps_start (float): starting value of epsilon, for epsilon-greedy action selection
            eps_end (float): minimum value of epsilon
            eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
        """
        # get the default brain
        brain_name = env.brain_names[0]
        brain = env.brains[brain_name]
        # list containing scores from each episode
        scores = []
        # list containing window averaged scores
        avg_scores = []
        # last 100 scores
        scores_window = deque(maxlen=100)
        # initialize epsilon
        eps = eps_start
        for i_episode in range(1, n_episodes + 1):
            env_info = env.reset(train_mode=True)[brain_name]
            state = env_info.vector_observations[0]
            score = 0
            for t in range(max_t):
                action = self.act(state, eps)
                env_info = env.step(action)[brain_name]
                # get the next state
                next_state = env_info.vector_observations[0]
                # get the reward
                reward = env_info.rewards[0]
                # see if episode has finished
                done = env_info.local_done[0]
                self.step((state, action, reward, next_state, done))
                state = next_state
                score += reward
                if done:
                    break
            # save most recent score
            scores_window.append(score)
            scores.append(score)
            avg_scores.append(np.mean(scores_window))
            # decrease epsilon
            eps = max(eps_end, eps_decay * eps)
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)),
                  end="")
            if i_episode % 100 == 0:
                print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                    i_episode, np.mean(scores_window)))
            if np.mean(scores_window) >= 13.0:
                print(
                    '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                    .format(i_episode - 100, np.mean(scores_window)))
                torch.save(self.qnetwork_local.state_dict(),
                           f'./checkpoints/checkpoint{self.output_name}.pth')
                break
        return scores, avg_scores

    def step(self, experience):
        """Save experience in replay memory and learn.
        
        Params
        ======
            experience (tuple): (state, action, reward, next_state, done)
        """
        # save experience
        self.memory.add(experience)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                self.learn()

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self):
        """Update value parameters using given batch of experience tuples.
        """
        # if using PER
        if self.per:
            states, actions, rewards, next_states, dones, idxs, is_weights = self.memory.sample(
            )

        # else normal replay buffer
        else:
            states, actions, rewards, next_states, dones = self.memory.sample()

        # if Double DQN
        if self.double_dqn:
            # Get predicted Q values (for next actions chosen by local model) from target model
            self.qnetwork_local.eval()
            with torch.no_grad():
                next_actions = self.qnetwork_local(next_states).detach().max(
                    1)[1].unsqueeze(1)
            self.qnetwork_local.train()
            Q_targets_next = self.qnetwork_target(next_states).gather(
                1, next_actions)

        else:
            # Get max predicted Q values (for next states) from target model
            Q_targets_next = self.qnetwork_target(next_states).detach().max(
                1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        if self.per:
            # weight the per-sample squared TD errors by the importance-sampling weights
            loss = (torch.FloatTensor(is_weights).to(device).view(-1, 1) *
                    F.mse_loss(Q_expected, Q_targets,
                               reduction='none')).mean()
        else:
            loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # if PER, update priority
        if self.per:
            errors = torch.abs(Q_expected - Q_targets).detach().cpu().numpy()
            self.memory.update(idxs, errors)

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example #16
0
class Trainer():
    def __init__(self, params: Parameters):
        self.parms = params

        self.env = Env(params.game,
                       params.gamma,
                       norm_rewards=None,
                       norm_states=False)

        self.buffer = ReplayBuffer(params.replay_size)

        # Seed
        self.env.seed(params.seed)
        np.random.seed(params.seed)
        tf.random.set_seed(params.seed)

        # Four critic nets
        critic_nets = [
            DDPGValueNet(feature_shape=self.env.features_shape,
                         a_num=self.env.num_actions,
                         lr=params.lr_c) for _ in range(4)
        ]
        self.critic1, self.critic2, self.target_critic1, self.target_critic2 = critic_nets

        # Two actor nets
        self.actor = CtsPolicy(action_bound=self.env.action_bound,
                               action_dim=self.env.num_actions,
                               lr=params.lr_a)
        self.target_actor = CtsPolicy(action_bound=self.env.action_bound,
                                      action_dim=self.env.num_actions,
                                      lr=params.lr_a)

        # Copy parms
        self._copy_para(self.critic1, self.target_critic1)
        self._copy_para(self.critic2, self.target_critic2)
        self._copy_para(self.actor, self.target_actor)

        self.train_step_cnt = 0

    def _copy_para(self, from_model, to_model):
        """
        Copy parameters for soft updating
        :param from_model: latest model
        :param to_model: target model
        :return: None
        """
        for i, j in zip(from_model.trainable_weights,
                        to_model.trainable_weights):
            j.assign(i)

    def _target_soft_update(self, net, target_net):
        """ soft update the target net with Polyak averaging """
        for target_param, param in zip(target_net.trainable_weights,
                                       net.trainable_weights):
            target_param.assign(  # copy weight value into target parameters
                target_param * (1.0 - self.parms.tau) + param * self.parms.tau)

    def _train(self):

        # Sample
        batch = self.buffer.sample(self.parms.batch_size)
        s = np.array([batch_[0] for batch_ in batch])
        a = np.array([batch_[1] for batch_ in batch])
        r = np.array([batch_[2] for batch_ in batch])
        s_next = np.array([batch_[3] for batch_ in batch])
        not_done = np.array([not batch_[4] for batch_ in batch])

        # Reshape
        r = r[:, np.newaxis]
        not_done = not_done[:, np.newaxis]

        # Set target y
        pi_next = self.target_actor(s_next)
        a_next = pi_next.sample()
        q_next = tf.minimum(self.target_critic1([s_next, a_next]),
                            self.target_critic2([s_next, a_next]))
        y = r + self.parms.gamma * q_next * not_done

        # Train critic1
        with tf.GradientTape() as c1_tape:
            q1 = self.critic1([s, a])
            c1_loss = tf.losses.mean_squared_error(y, q1)
        c1_grads = c1_tape.gradient(c1_loss, self.critic1.trainable_weights)
        self.critic1.optimizer.apply_gradients(
            zip(c1_grads, self.critic1.trainable_weights))

        # Train critic2
        with tf.GradientTape() as c2_tape:
            q2 = self.critic2([s, a])
            c2_loss = tf.losses.mean_squared_error(y, q2)
        c2_grads = c2_tape.gradient(c2_loss, self.critic2.trainable_weights)
        self.critic2.optimizer.apply_gradients(
            zip(c2_grads, self.critic2.trainable_weights))

        # Train actor
        if self.train_step_cnt % self.parms.actor_interval == 0:

            with tf.GradientTape() as a_tape:
                pi = self.actor(s)
                a = pi.sample()
                q = self.critic1([s, a])
                a_loss = -tf.reduce_mean(q)
            a_grads = a_tape.gradient(a_loss, self.actor.trainable_weights)
            self.actor.optimizer.apply_gradients(
                zip(a_grads, self.actor.trainable_weights))

            # update parms
            self._target_soft_update(self.actor, self.target_actor)
            self._target_soft_update(self.critic1, self.target_critic1)
            self._target_soft_update(self.critic2, self.target_critic2)

    def train_step(self):

        # Episode information
        episode_ret = []

        # Initialize s
        s = self.env.reset()
        for _ in range(self.parms.train_step_len):
            # Interact
            pi = self.actor(s[np.newaxis, :])  # batch_size=1

            a = pi.sample()[0]
            s_next, r, done, info = self.env.step(a)

            # Store
            self.buffer.store((s, a, r, s_next, done))

            # Train
            if self.buffer.size() > self.parms.start_size:
                self._train()
                self.train_step_cnt += 1

            if done:
                _, ret = info['done']
                episode_ret.append(ret)
                s_next = self.env.reset()

            s = s_next

        return np.mean(episode_ret)
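
The Trainer above only needs its buffer to support store(transition), sample(batch_size) returning a list of (s, a, r, s_next, done) tuples, and size(). A plausible minimal implementation (a sketch, not the project's own class):

import random
from collections import deque


class ReplayBuffer:
    def __init__(self, max_size):
        self.storage = deque(maxlen=max_size)

    def store(self, transition):
        # transition is the (s, a, r, s_next, done) tuple stored in train_step()
        self.storage.append(transition)

    def sample(self, batch_size):
        # uniform sampling without replacement; _train() unpacks the tuples itself
        return random.sample(list(self.storage), batch_size)

    def size(self):
        return len(self.storage)
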
Example #17
0
File: train.py Project: bbeatrix/ebm_cl
def train(target_vars, saver, sess, logger, dataloaders, test_dataloaders,
          resume_iter, logdir):
    X = target_vars['X']
    Y = target_vars['Y']
    X_NOISE = target_vars['X_NOISE']
    train_op = target_vars['train_op']
    energy_pos = target_vars['energy_pos']
    energy_neg = target_vars['energy_neg']
    loss_energy = target_vars['loss_energy']
    loss_ml = target_vars['loss_ml']
    loss_total = target_vars['total_loss']
    gvs = target_vars['gvs']
    x_grad = target_vars['x_grad']
    x_grad_first = target_vars['x_grad_first']
    x_off = target_vars['x_off']
    temp = target_vars['temp']
    x_mod = target_vars['x_mod']
    LABEL = target_vars['LABEL']
    LABEL_POS = target_vars['LABEL_POS']
    weights = target_vars['weights']
    test_x_mod = target_vars['test_x_mod']
    eps = target_vars['eps_begin']
    label_ent = target_vars['label_ent']

    set_seed(0)
    np.random.seed(0)
    random.seed(0)

    if FLAGS.use_attention:
        gamma = weights[0]['atten']['gamma']
    else:
        gamma = tf.zeros(1)

    val_output = [test_x_mod]

    gvs_dict = dict(gvs)

    log_output = [
        train_op, energy_pos, energy_neg, eps, loss_energy, loss_ml,
        loss_total, x_grad, x_off, x_mod, gamma, x_grad_first, label_ent,
        *gvs_dict.keys()
    ]
    output = [train_op, x_mod]

    replay_buffer = ReplayBuffer(10000)
    itr = resume_iter
    x_mod = None
    gd_steps = 1

    err_message = 'Total number of epochs should be divisible by the number of CL tasks.'
    assert FLAGS.epoch_num % FLAGS.num_tasks == 0, err_message
    epochs_per_task = FLAGS.epoch_num // FLAGS.num_tasks // FLAGS.num_cycles

    for task_index, dataloader in enumerate(dataloaders):
        dataloader_iterator = iter(dataloader)
        best_inception = 0.0

        for epoch in range(1, epochs_per_task + 1):
            for data_corrupt, data, label in dataloader:
                print('Iter: {}; Epoch: {}/{}; Task: {}/{}'.format(
                    itr, epoch + (task_index * epochs_per_task),
                    FLAGS.epoch_num, task_index + 1, FLAGS.num_tasks))
                data_corrupt = data_corrupt_init = data_corrupt.numpy()
                data_corrupt_init = data_corrupt.copy()

                data = data.numpy()
                label = label.numpy()

                label_init = label.copy()

                if FLAGS.mixup:
                    idx = np.random.permutation(data.shape[0])
                    lam = np.random.beta(1, 1, size=(data.shape[0], 1, 1, 1))
                    data = data * lam + data[idx] * (1 - lam)

                if FLAGS.replay_batch and (x_mod is not None):
                    replay_buffer.add(compress_x_mod(x_mod))

                    if len(replay_buffer) > FLAGS.batch_size:
                        replay_batch = replay_buffer.sample(FLAGS.batch_size)
                        replay_batch = decompress_x_mod(replay_batch)
                        replay_mask = (np.random.uniform(
                            0, FLAGS.rescale, FLAGS.batch_size) > 0.05)
                        data_corrupt[replay_mask] = replay_batch[replay_mask]

                if FLAGS.pcd:
                    if x_mod is not None:
                        data_corrupt = x_mod

                feed_dict = {X_NOISE: data_corrupt, X: data, Y: label}

                if FLAGS.cclass:
                    feed_dict[LABEL] = label
                    feed_dict[LABEL_POS] = label_init

                if itr % FLAGS.log_interval == 0:
                    _, e_pos, e_neg, eps, loss_e, loss_ml, loss_total, x_grad, x_off, x_mod, gamma, x_grad_first, label_ent, * \
                        grads = sess.run(log_output, feed_dict)

                    kvs = {}
                    kvs['e_pos'] = e_pos.mean()
                    kvs['e_pos_std'] = e_pos.std()
                    kvs['e_neg'] = e_neg.mean()
                    kvs['e_diff'] = kvs['e_pos'] - kvs['e_neg']
                    kvs['e_neg_std'] = e_neg.std()
                    kvs['temp'] = temp
                    kvs['loss_e'] = loss_e.mean()
                    kvs['eps'] = eps.mean()
                    kvs['label_ent'] = label_ent
                    kvs['loss_ml'] = loss_ml.mean()
                    kvs['loss_total'] = loss_total.mean()
                    kvs['x_grad'] = np.abs(x_grad).mean()
                    kvs['x_grad_first'] = np.abs(x_grad_first).mean()
                    kvs['x_off'] = x_off.mean()
                    kvs['iter'] = itr
                    kvs['gamma'] = gamma

                    for v, k in zip(grads,
                                    [v.name for v in gvs_dict.values()]):
                        kvs[k] = np.abs(v).max()

                    string = "Obtained a total of "
                    for key, value in kvs.items():
                        string += "{}: {}, ".format(key, value)

                    if hvd.rank() == 0:
                        print(string)
                        logger.writekvs(kvs)
                        for key, value in kvs.items():
                            neptune.log_metric(key, x=itr, y=value)

                else:
                    _, x_mod = sess.run(output, feed_dict)

                if itr % FLAGS.save_interval == 0 and hvd.rank() == 0:
                    saver.save(
                        sess,
                        osp.join(FLAGS.logdir, FLAGS.exp,
                                 'model_{}'.format(itr)))

                if itr % FLAGS.test_interval == 0 and hvd.rank(
                ) == 0 and FLAGS.dataset != '2d':
                    if FLAGS.dataset == 'cifar10':
                        cifar10_map = {
                            0: 'airplane',
                            1: 'automobile',
                            2: 'bird',
                            3: 'cat',
                            4: 'deer',
                            5: 'dog',
                            6: 'frog',
                            7: 'horse',
                            8: 'ship',
                            9: 'truck'
                        }

                        imgs = data
                        labels = np.argmax(label, axis=1)
                        for idx, img in enumerate(imgs[:20, :, :, :]):
                            neptune.log_image(
                                'input_images',
                                rescale_im(imgs[idx]),
                                description=str(int(labels[idx])) + ': ' +
                                cifar10_map[int(labels[idx])])

                    if FLAGS.evaluate:
                        print('Test.')
                        train_acc = test_accuracy(target_vars, saver, sess,
                                                  logger, test_dataloaders[0])
                        test_acc = test_accuracy(target_vars, saver, sess,
                                                 logger, test_dataloaders[1])
                        neptune.log_metric('train_accuracy',
                                           x=itr,
                                           y=train_acc)
                        neptune.log_metric('test_accuracy', x=itr, y=test_acc)

                    try_im = x_mod
                    orig_im = data_corrupt.squeeze()
                    actual_im = rescale_im(data)

                    orig_im = rescale_im(orig_im)
                    try_im = rescale_im(try_im).squeeze()

                    for i, (im, t_im, actual_im_i) in enumerate(
                            zip(orig_im[:20], try_im[:20], actual_im)):
                        shape = orig_im.shape[1:]
                        new_im = np.zeros((shape[0], shape[1] * 3, *shape[2:]))
                        size = shape[1]
                        new_im[:, :size] = im
                        new_im[:, size:2 * size] = t_im
                        new_im[:, 2 * size:] = actual_im_i

                        log_image(new_im,
                                  logger,
                                  'train_gen_{}'.format(itr),
                                  step=i)
                        neptune.log_image(
                            'train_gen',
                            x=new_im,
                            description='train_gen_iter:{}_idx:{}'.format(
                                itr, i))
                    test_im = x_mod

                    try:
                        data_corrupt, data, label = next(dataloader_iterator)
                    except BaseException:
                        dataloader_iterator = iter(dataloader)
                        data_corrupt, data, label = next(dataloader_iterator)

                    data_corrupt = data_corrupt.numpy()

                    if FLAGS.replay_batch and (
                            x_mod is not None) and len(replay_buffer) > 0:
                        replay_batch = replay_buffer.sample(FLAGS.batch_size)
                        replay_batch = decompress_x_mod(replay_batch)
                        replay_mask = (np.random.uniform(
                            0, 1, (FLAGS.batch_size)) > 0.05)
                        data_corrupt[replay_mask] = replay_batch[replay_mask]

                    if FLAGS.dataset == 'cifar10' or FLAGS.dataset == 'imagenet' or FLAGS.dataset == 'imagenetfull':
                        n = 128

                        if FLAGS.dataset == "imagenetfull":
                            n = 32

                        if len(replay_buffer) > n:
                            data_corrupt = decompress_x_mod(
                                replay_buffer.sample(n))
                        elif FLAGS.dataset == 'imagenetfull':
                            data_corrupt = np.random.uniform(
                                0, FLAGS.rescale, (n, 128, 128, 3))
                        else:
                            data_corrupt = np.random.uniform(
                                0, FLAGS.rescale, (n, 32, 32, 3))

                        if FLAGS.dataset == 'cifar10':
                            label = np.eye(10)[np.random.randint(0, 10, (n))]
                        else:
                            label = np.eye(1000)[np.random.randint(
                                0, 1000, (n))]

                    feed_dict[X_NOISE] = data_corrupt

                    feed_dict[X] = data

                    if FLAGS.cclass:
                        feed_dict[LABEL] = label

                    test_x_mod = sess.run(val_output, feed_dict)

                    try_im = test_x_mod
                    orig_im = data_corrupt.squeeze()
                    actual_im = rescale_im(data.numpy())

                    orig_im = rescale_im(orig_im)
                    try_im = rescale_im(try_im).squeeze()

                    for i, (im, t_im, actual_im_i) in enumerate(
                            zip(orig_im[:20], try_im[:20], actual_im)):

                        shape = orig_im.shape[1:]
                        new_im = np.zeros((shape[0], shape[1] * 3, *shape[2:]))
                        size = shape[1]
                        new_im[:, :size] = im
                        new_im[:, size:2 * size] = t_im
                        new_im[:, 2 * size:] = actual_im_i
                        log_image(new_im,
                                  logger,
                                  'val_gen_{}'.format(itr),
                                  step=i)
                        neptune.log_image(
                            'val_gen',
                            new_im,
                            description='val_gen_iter:{}_idx:{}'.format(
                                itr, i))

                    score, std = get_inception_score(list(try_im), splits=1)
                    print("Inception score of {} with std of {}".format(
                        score, std))
                    kvs = {}
                    kvs['inception_score'] = score
                    kvs['inception_score_std'] = std
                    logger.writekvs(kvs)
                    for key, value in kvs.items():
                        neptune.log_metric(key, x=itr, y=value)

                    if score > best_inception:
                        best_inception = score
                        saver.save(
                            sess,
                            osp.join(FLAGS.logdir, FLAGS.exp, 'model_best'))

                if itr > 600000 and FLAGS.dataset == "mnist":
                    assert False
                itr += 1

        saver.save(sess,
                   osp.join(FLAGS.logdir, FLAGS.exp, 'model_{}'.format(itr)))
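
The loop above stores negative samples in the replay buffer in compressed form via compress_x_mod / decompress_x_mod, which are not shown. A plausible sketch of those helpers, assuming samples live in [0, FLAGS.rescale] (here passed in as a plain rescale argument):

import numpy as np


def compress_x_mod(x_mod, rescale=1.0):
    # quantize the float samples to uint8 so the 10000-entry buffer stays small
    return (255.0 * np.clip(x_mod, 0.0, rescale) / rescale).astype(np.uint8)


def decompress_x_mod(x_mod, rescale=1.0):
    # dequantize back to floats, adding uniform noise to smooth out the
    # quantization bins
    return x_mod * rescale / 256.0 + np.random.uniform(
        0.0, rescale / 256.0, size=x_mod.shape)
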
Example #18
0
class MADDPG():
    def __init__(self, state_size, action_size, num_agents, random_seed=0):
        in_critic = num_agents * state_size
        self.agents = [
            DDPG_agent(state_size, in_critic, action_size, num_agents,
                       random_seed) for i in range(num_agents)
        ]
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, random_seed)
        self.num_agents = num_agents

    def act(self, states, add_noise=True):
        """Returns actions for given state as per current policy."""
        actions = [
            agent.act(state, add_noise)
            for agent, state in zip(self.agents, states)
        ]
        return actions

    def target_act(self, states):
        """Returns actions for given state as per current policy."""
        actions = [
            agent.target_act(state)
            for agent, state in zip(self.agents, states)
        ]
        return actions

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        #for i in range(state.shape[0]):
        state = np.asanyarray(state)
        action = np.asanyarray(action)
        reward = np.asanyarray(reward)
        next_state = np.asanyarray(next_state)
        done = np.asanyarray(done)
        self.memory.add(state.reshape((1, self.num_agents, -1)), action.reshape((1, self.num_agents, -1)), \
                        reward.reshape((1, self.num_agents, -1)), next_state.reshape((1,self.num_agents, -1)), \
                        done.reshape((1, self.num_agents, -1)))

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            for i_agent in range(self.num_agents):
                experiences = self.memory.sample()
                self.learn(experiences, i_agent, GAMMA)

    def reset(self):
        [agent.reset() for agent in self.agents]

    def learn(self, experiences, i_agent, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """

        states, actions, rewards, next_states, dones = experiences

        agent = self.agents[i_agent]
        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models

        next_states = next_states.view(1, BATCH_SIZE, self.num_agents, -1)
        actions_next = torch.cat(self.target_act(next_states), dim=1)
        next_states = next_states.view(BATCH_SIZE, -1)
        actions_next = actions_next.view(BATCH_SIZE, -1)

        Q_targets_next = agent.critic_target(next_states, actions_next)

        # Compute Q targets for current states (y_i)
        Q_targets = rewards[:, i_agent] + (gamma * Q_targets_next *
                                           (1 - dones[:, i_agent]))
        # Compute critic loss
        Q_expected = agent.critic_local(states.view(BATCH_SIZE, -1),
                                        actions.view(BATCH_SIZE, -1))
        # mean squared error loss
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        # zero_grad because we do not want to accumulate
        # gradients from other batches, so needs to be cleared
        agent.critic_optimizer.zero_grad()
        # compute derivatives for all variables that
        # requires_grad-True
        critic_loss.backward()
        # update those variables that requires_grad-True
        agent.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        # take the current states and predict actions
        #states = states.view(1, BATCH_SIZE, self.num_agents, -1)
        actions_pred = agent.actor_local(states)
        #print (actions_pred.shape)
        #actions_pred = torch.cat(actions_pred, dim=1)
        # -1 * (maximize) Q value for the current prediction
        actor_loss = -agent.critic_local(states.view(
            BATCH_SIZE, -1), actions_pred.view(BATCH_SIZE, -1)).mean()
        # Minimize the loss
        # zero_grad because we do not want to accumulate
        # gradients from other batches, so needs to be cleared
        agent.actor_optimizer.zero_grad()
        # compute derivatives for all variables that
        # requires_grad-True
        actor_loss.backward()
        # update those variables that requires_grad-True
        agent.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(agent.critic_local, agent.critic_target, TAU)
        self.soft_update(agent.actor_local, agent.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example #19
0
def gentest(sess, kvs, data, latents, save_exp_dir):
    X_NOISE = kvs['X_NOISE']
    LABEL_SIZE = kvs['LABEL_SIZE']
    LABEL_SHAPE = kvs['LABEL_SHAPE']
    LABEL_POS = kvs['LABEL_POS']
    LABEL_ROT = kvs['LABEL_ROT']
    model_size = kvs['model_size']
    model_shape = kvs['model_shape']
    model_pos = kvs['model_pos']
    model_rot = kvs['model_rot']
    weight_size = kvs['weight_size']
    weight_shape = kvs['weight_shape']
    weight_pos = kvs['weight_pos']
    weight_rot = kvs['weight_rot']
    X = tf.placeholder(shape=(None, 64, 64), dtype=tf.float32)

    datafull = data
    # Test combination of generalization where we use slices of both training
    x_final = X_NOISE
    x_mod_size = X_NOISE
    x_mod_pos = X_NOISE

    for i in range(FLAGS.num_steps):

        # use cond_pos

        energies = []
        x_mod_pos = x_mod_pos + tf.random_normal(tf.shape(x_mod_pos), mean=0.0, stddev=0.005)
        e_noise = model_pos.forward(x_final, weight_pos, label=LABEL_POS)

        # energies.append(e_noise)
        x_grad = tf.gradients(e_noise, [x_final])[0]
        x_mod_pos = x_mod_pos + tf.random_normal(tf.shape(x_mod_pos), mean=0.0, stddev=0.005)
        x_mod_pos = x_mod_pos - FLAGS.step_lr * x_grad
        x_mod_pos = tf.clip_by_value(x_mod_pos, 0, 1)

        if FLAGS.joint_shape:
            # use cond_shape
            e_noise = model_shape.forward(x_mod_pos, weight_shape, label=LABEL_SHAPE)
        elif FLAGS.joint_rot:
            e_noise = model_rot.forward(x_mod_pos, weight_rot, label=LABEL_ROT)
        else:
            # use cond_size
            e_noise = model_size.forward(x_mod_pos, weight_size, label=LABEL_SIZE)

        # energies.append(e_noise)
        # energy_stack = tf.concat(energies, axis=1)
        # energy_stack = tf.reduce_logsumexp(-1*energy_stack, axis=1)
        # energy_stack = tf.reduce_sum(energy_stack, axis=1)

        x_grad = tf.gradients(e_noise, [x_mod_pos])[0]
        x_mod_pos = x_mod_pos - FLAGS.step_lr * x_grad
        x_mod_pos = tf.clip_by_value(x_mod_pos, 0, 1)

        # for x_mod_size
        # use cond_size
        # e_noise = model_size.forward(x_mod_size, weight_size, label=LABEL_SIZE)
        # x_grad = tf.gradients(e_noise, [x_mod_size])[0]
        # x_mod_size = x_mod_size + tf.random_normal(tf.shape(x_mod_size), mean=0.0, stddev=0.005)
        # x_mod_size = x_mod_size - FLAGS.step_lr * x_grad
        # x_mod_size = tf.clip_by_value(x_mod_size, 0, 1)

        # # use cond_pos
        # e_noise = model_pos.forward(x_mod_size, weight_pos, label=LABEL_POS)
        # x_grad = tf.gradients(e_noise, [x_mod_size])[0]
        # x_mod_size = x_mod_size + tf.random_normal(tf.shape(x_mod_size), mean=0.0, stddev=0.005)
        # x_mod_size = x_mod_size - FLAGS.step_lr * tf.stop_gradient(x_grad)
        # x_mod_size = tf.clip_by_value(x_mod_size, 0, 1)

    x_mod = x_mod_pos
    x_final = x_mod


    if FLAGS.joint_shape:
        loss_kl = model_shape.forward(x_final, weight_shape, reuse=True, label=LABEL_SHAPE, stop_grad=True) + \
                  model_pos.forward(x_final, weight_pos, reuse=True, label=LABEL_POS, stop_grad=True)

        energy_pos = model_shape.forward(X, weight_shape, reuse=True, label=LABEL_SHAPE) + \
                      model_pos.forward(X, weight_pos, reuse=True, label=LABEL_POS)

        energy_neg = model_shape.forward(tf.stop_gradient(x_mod), weight_shape, reuse=True, label=LABEL_SHAPE) + \
                      model_pos.forward(tf.stop_gradient(x_mod), weight_pos, reuse=True, label=LABEL_POS)
    elif FLAGS.joint_rot:
        loss_kl = model_rot.forward(x_final, weight_rot, reuse=True, label=LABEL_ROT, stop_grad=True) + \
                  model_pos.forward(x_final, weight_pos, reuse=True, label=LABEL_POS, stop_grad=True)

        energy_pos = model_rot.forward(X, weight_rot, reuse=True, label=LABEL_ROT) + \
                      model_pos.forward(X, weight_pos, reuse=True, label=LABEL_POS)

        energy_neg = model_rot.forward(tf.stop_gradient(x_mod), weight_rot, reuse=True, label=LABEL_ROT) + \
                      model_pos.forward(tf.stop_gradient(x_mod), weight_pos, reuse=True, label=LABEL_POS)
    else:
        loss_kl = model_size.forward(x_final, weight_size, reuse=True, label=LABEL_SIZE, stop_grad=True) + \
                    model_pos.forward(x_final, weight_pos, reuse=True, label=LABEL_POS, stop_grad=True)

        energy_pos = model_size.forward(X, weight_size, reuse=True, label=LABEL_SIZE) + \
                      model_pos.forward(X, weight_pos, reuse=True, label=LABEL_POS)

        energy_neg = model_size.forward(tf.stop_gradient(x_mod), weight_size, reuse=True, label=LABEL_SIZE) + \
                      model_pos.forward(tf.stop_gradient(x_mod), weight_pos, reuse=True, label=LABEL_POS)

    energy_neg_reduced = (energy_neg - tf.reduce_min(energy_neg))
    coeff = tf.stop_gradient(tf.exp(-energy_neg_reduced))
    norm_constant = tf.stop_gradient(tf.reduce_sum(coeff)) + 1e-4
    neg_loss = coeff * (-1*energy_neg) / norm_constant

    loss_ml = tf.reduce_mean(energy_pos) - tf.reduce_mean(energy_neg)
    loss_total = loss_ml + tf.reduce_mean(loss_kl) + 1 * (tf.reduce_mean(tf.square(energy_pos)) + tf.reduce_mean(tf.square(energy_neg)))

    optimizer = AdamOptimizer(1e-3, beta1=0.0, beta2=0.999)
    gvs = optimizer.compute_gradients(loss_total)
    gvs = [(k, v) for (k, v) in gvs if k is not None]
    train_op = optimizer.apply_gradients(gvs)

    vs = optimizer.variables()
    sess.run(tf.variables_initializer(vs))

    dataloader = DataLoader(DSpritesGen(data, latents), batch_size=FLAGS.batch_size, num_workers=6, drop_last=True, shuffle=True)

    x_off = tf.reduce_mean(tf.square(x_mod - X))

    itr = 0
    saver = tf.train.Saver()
    x_mod = None


    if FLAGS.train:
        replay_buffer = ReplayBuffer(10000)
        for _ in range(1):


            for data_corrupt, data, label_size, label_pos in tqdm(dataloader):
                data_corrupt = data_corrupt.numpy()[:, :, :]
                data = data.numpy()[:, :, :]

                if x_mod is not None:
                    replay_buffer.add(x_mod)
                    replay_batch = replay_buffer.sample(FLAGS.batch_size)
                    replay_mask = (np.random.uniform(0, 1, (FLAGS.batch_size)) > 0.95)
                    data_corrupt[replay_mask] = replay_batch[replay_mask]

                if FLAGS.joint_shape:
                    feed_dict = {X_NOISE: data_corrupt, X: data, LABEL_SHAPE: label_size, LABEL_POS: label_pos}
                elif FLAGS.joint_rot:
                    feed_dict = {X_NOISE: data_corrupt, X: data, LABEL_ROT: label_size, LABEL_POS: label_pos}
                else:
                    feed_dict = {X_NOISE: data_corrupt, X: data, LABEL_SIZE: label_size, LABEL_POS: label_pos}

                _, off_value, e_pos, e_neg, x_mod = sess.run([train_op, x_off, energy_pos, energy_neg, x_final], feed_dict=feed_dict)
                itr += 1

                if itr % 10 == 0:
                    print("x_off of {}, e_pos of {}, e_neg of {} itr of {}".format(off_value, e_pos.mean(), e_neg.mean(), itr))

                if itr == FLAGS.break_steps:
                    break


        saver.save(sess, osp.join(save_exp_dir, 'model_gentest'))

    saver.restore(sess, osp.join(save_exp_dir, 'model_gentest'))

    l = latents

    if FLAGS.joint_shape:
        mask_gen = (l[:, 3] == 30 * np.pi / 39) * (l[:, 2] == 0.5)
    elif FLAGS.joint_rot:
        mask_gen = (l[:, 1] == 1) * (l[:, 2] == 0.5)
    else:
        mask_gen = (l[:, 3] == 30 * np.pi / 39) * (l[:, 1] == 1) & (~((l[:, 2] == 0.5) | ((l[:, 4] == 16/31) & (l[:, 5] == 16/31))))

    data_gen = datafull[mask_gen]
    latents_gen = latents[mask_gen]

    losses = []

    for dat, latent in zip(np.array_split(data_gen, 120), np.array_split(latents_gen, 120)):
        x = 0.5 + np.random.randn(*dat.shape)

        if FLAGS.joint_shape:
            feed_dict = {LABEL_SHAPE: np.eye(3)[latent[:, 1].astype(np.int32) - 1], LABEL_POS: latent[:, 4:], X_NOISE: x, X: dat}
        elif FLAGS.joint_rot:
            feed_dict = {LABEL_ROT: np.concatenate([np.cos(latent[:, 3:4]), np.sin(latent[:, 3:4])], axis=1), LABEL_POS: latent[:, 4:], X_NOISE: x, X: dat}
        else:
            feed_dict = {LABEL_SIZE: latent[:, 2:3], LABEL_POS: latent[:, 4:], X_NOISE: x, X: dat}

        for i in range(2):
            x = sess.run([x_final], feed_dict=feed_dict)[0]
            feed_dict[X_NOISE] = x

        loss = sess.run([x_off], feed_dict=feed_dict)[0]
        losses.append(loss)

    print("Mean MSE loss of {} ".format(np.mean(losses)))

    data_try = data_gen[:10]
    data_init = 0.5 + 0.5 * np.random.randn(10, 64, 64)
    latent_scale = latents_gen[:10, 2:3]
    latent_pos = latents_gen[:10, 4:]

    if FLAGS.joint_shape:
        feed_dict = {X_NOISE: data_init, LABEL_SHAPE: np.eye(3)[latent[:10, 1].astype(np.int32)-1], LABEL_POS: latent_pos}
    elif FLAGS.joint_rot:
        feed_dict = {LABEL_ROT: np.concatenate([np.cos(latent[:10, 3:4]), np.sin(latent[:10, 3:4])], axis=1), LABEL_POS: latent[:10, 4:], X_NOISE: data_init}
    else:
        feed_dict = {X_NOISE: data_init, LABEL_SIZE: latent_scale, LABEL_POS: latent_pos}

    x_output = sess.run([x_final], feed_dict=feed_dict)[0]

    if FLAGS.joint_shape:
        im_name = "size_shape_combine_gentest.png"
    else:
        im_name = "size_scale_combine_gentest.png"

    x_output_wrap = np.ones((10, 66, 66))
    data_try_wrap = np.ones((10, 66, 66))

    x_output_wrap[:, 1:-1, 1:-1] = x_output
    data_try_wrap[:, 1:-1, 1:-1] = data_try

    im_output = np.concatenate([x_output_wrap, data_try_wrap], axis=2).reshape(-1, 66*2)
    impath = osp.join(save_exp_dir, im_name)
    imsave(impath, im_output)
    print("Successfully saved images at {}".format(impath))
Example #20
0
class Trainer():
    def __init__(self, params: Parameters):
        self.parms = params

        self.env = Env(params.game,
                       params.gamma,
                       norm_rewards=None,
                       norm_states=False)

        self.buffer = ReplayBuffer(params.replay_size)

        # Seed
        self.env.seed(params.seed)
        np.random.seed(params.seed)
        tf.random.set_seed(params.seed)

        self.critic = DDPGValueNet(feature_shape=self.env.features_shape,
                                   a_num=self.env.num_actions,
                                   lr=params.lr_c)
        self.target_critic = DDPGValueNet(
            feature_shape=self.env.features_shape,
            a_num=self.env.num_actions,
            lr=params.lr_c)
        self._copy_para(self.critic.model, self.target_critic.model)

        self.actor = CtsPolicy(action_bound=self.env.action_bound,
                               action_dim=self.env.num_actions,
                               lr=params.lr_a)
        self.target_actor = CtsPolicy(action_bound=self.env.action_bound,
                                      action_dim=self.env.num_actions,
                                      lr=params.lr_a)
        self._copy_para(self.actor, self.target_actor)

        self.ema = tf.train.ExponentialMovingAverage(decay=1.0 -
                                                     self.parms.tau)

    def _copy_para(self, from_model, to_model):
        """
        Copy parameters for soft updating
        :param from_model: latest model
        :param to_model: target model
        :return: None
        """
        for i, j in zip(from_model.trainable_weights,
                        to_model.trainable_weights):
            j.assign(i)

    def _ema_update(self):

        paras = self.actor.trainable_weights + \
                self.critic.model.trainable_weights

        self.ema.apply(paras)

        for i, j in zip(self.target_actor.trainable_weights + \
            self.target_critic.model.trainable_weights, paras):
            i.assign(self.ema.average(j))

    def _train(self):

        # Sample
        batch = self.buffer.sample(self.parms.batch_size)
        s = np.array([batch_[0] for batch_ in batch])
        a = np.array([batch_[1] for batch_ in batch])
        r = np.array([batch_[2] for batch_ in batch])
        s_next = np.array([batch_[3] for batch_ in batch])
        not_done = np.array([not batch_[4] for batch_ in batch])

        # Reshape
        r = r[:, np.newaxis]
        not_done = not_done[:, np.newaxis]

        # Train critic
        with tf.GradientTape() as tape:
            pi_next = self.target_actor(s_next)
            a_next = pi_next.sample()
            q_next = self.target_critic([s_next, a_next])
            y = r + self.parms.gamma * q_next * not_done
            q = self.critic([s, a])
            c_loss = tf.losses.mean_squared_error(y, q)
        c_grads = tape.gradient(c_loss, self.critic.model.trainable_weights)
        self.critic.model.optimizer.apply_gradients(
            zip(c_grads, self.critic.model.trainable_weights))

        # Train actor
        with tf.GradientTape() as tape:
            pi = self.actor(s)
            a = pi.sample()
            q = self.critic([s, a])
            a_loss = -tf.reduce_mean(q)
        a_grads = tape.gradient(a_loss, self.actor.trainable_weights)
        self.actor.optimizer.apply_gradients(
            zip(a_grads, self.actor.trainable_weights))

        self._ema_update()

    def train_step(self):

        # Episode information
        episode_ret = []

        # Initialize s
        s = self.env.reset()
        for _ in range(self.parms.train_step_len):
            # Interact
            pi = self.actor(s[np.newaxis, :])  # batch_size=1
            a = pi.sample()[0]
            s_next, r, done, info = self.env.step(a)

            # Store
            self.buffer.store((s, a, r, s_next, done))

            # Train
            if self.buffer.size() > self.parms.start_size:
                self._train()

            if done:
                _, ret = info['done']
                episode_ret.append(ret)
                s_next = self.env.reset()

            s = s_next

        return np.mean(episode_ret)
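
This Trainer maintains its target networks with tf.train.ExponentialMovingAverage instead of an explicit soft-update loop. With decay = 1 - tau, the shadow update is exactly the Polyak rule target <- (1 - tau) * target + tau * online. A tiny TensorFlow 2 check of that equivalence (the values here are only illustrative):

import tensorflow as tf

tau = 0.01
v = tf.Variable(1.0)
ema = tf.train.ExponentialMovingAverage(decay=1.0 - tau)
ema.apply([v])                 # first apply initializes the shadow copy to v
v.assign(2.0)
ema.apply([v])                 # shadow = (1 - tau) * shadow + tau * v
print(float(ema.average(v)))   # ~1.01
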
Example #21
0
class Agent:

    def __init__(self, env, use_cnn=False, learning_rate=3e-4, gamma=0.99, buffer_size=10000):
        self.env = env
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.replay_buffer = ReplayBuffer(buffer_size)
        self.dqn = CnnDQN(env.observation_space.shape, env.action_space.n) if use_cnn else DQN(env.observation_space.shape[0], env.action_space.n) 
        self.dqn_optimizer = torch.optim.Adam(self.dqn.parameters())
        self.dqn_loss = torch.nn.MSELoss()

    def update_model(self, batch_size):
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(batch_size)
        states = torch.FloatTensor(states)
        actions = torch.LongTensor(actions)
        rewards = torch.FloatTensor(rewards)
        next_states = torch.FloatTensor(next_states)
        dones = torch.FloatTensor(dones)
        
        curr_Q = self.dqn.forward(states)
        curr_Q = curr_Q.gather(1, actions.unsqueeze(1)).squeeze(1)
        next_Q = self.dqn.forward(next_states)
        max_next_Q = torch.max(next_Q, 1)[0]
        # mask the bootstrap term at episode boundaries using the sampled dones
        expected_Q = rewards.squeeze(1) + self.gamma * max_next_Q * (1 - dones.view(-1))

        self.dqn_optimizer.zero_grad()
        loss = self.dqn_loss(curr_Q, expected_Q)
        loss.backward()
        self.dqn_optimizer.step()
        
        return loss

    def max_action(self, state):
        state = autograd.Variable(torch.from_numpy(state).float().unsqueeze(0))
        qvals = self.dqn.forward(state)
        action = np.argmax(qvals.detach().numpy())
  
        return action
      
    def train(self, max_episodes, max_steps, batch_size):
        episode_rewards = []
        loss = []
        
        for episodes in range(max_episodes):
            state = self.env.reset()  
            episode_reward = 0
            for steps in range(max_steps):
                action = self.max_action(state)
                next_state, reward, done, _ = self.env.step(action)
                self.replay_buffer.push(state, action, reward, next_state, done)
                state = next_state
                episode_reward += reward
                
                if done:
                    episode_rewards.append(episode_reward)
                    print(episode_reward)
                    break
                
                if(len(self.replay_buffer) > batch_size):
                    step_loss = self.update_model(batch_size)
                    loss.append(step_loss)
                    #self.adjust_temperature(loss)
                
        # return episode_rewards, loss
                  
    def run(self, max_episodes, max_steps):
        episode_rewards = []
        for episodes in range(max_episodes):
            state = self.env.reset()  
            episode_reward = 0
            for steps in range(max_steps):
                action = self.max_action(state)
                next_state, reward, done, _ = self.env.step(action)
                state = next_state
                episode_reward += reward
                  
                if done:
                    episode_rewards.append(episode_reward)
                    break
                  
        return episode_rewards

    def save_model(self, PATH):
        torch.save(self.dqn.state_dict(), PATH)
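
The Agent above assumes a DQN network built as DQN(observation_dim, num_actions) and called through .forward(states) to return one Q-value per action (with a CnnDQN counterpart for image observations). A minimal sketch of the fully connected variant; the layer sizes are assumptions, not taken from the original project:

import torch.nn as nn


class DQN(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim=128):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),  # one Q-value per action
        )

    def forward(self, x):
        return self.net(x)
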
Example #22
0
class DDPGAgent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, num_agents, random_seed):
        """ Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # for MADDPG
        self.num_agents = num_agents

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise((num_agents, action_size), random_seed)
        self.eps = EPS_START
        self.eps_decay = 1 / (EPS_EP_END * LEARN_NUM)
        self.timestep = 0

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def step(self, state, action, reward, next_state, done, agent_number):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        self.timestep += 1
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)
        # Learn, if enough samples are available in memory and at learning interval settings
        if len(self.memory) > BATCH_SIZE and self.timestep % LEARN_EVERY == 0:
            for _ in range(LEARN_NUM):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA, agent_number)

    def act(self, states, add_noise):
        """Returns actions for both agents as per current policy, given their respective states."""
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            # For MADDPG: get action for each agent and concatenate them
            for agent_num, state in enumerate(states):
                action = self.actor_local(state).cpu().data.numpy()
                actions[agent_num, :] = action
        self.actor_local.train()
        if add_noise:
            actions += self.noise.sample()
        actions = np.clip(actions, -1, 1)
        return actions

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma, agent_number):
        """Update policy and value parameters using given batch of experience tuples.
            Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
            where:
                actor_target(state) -> action
                critic_target(state, action) -> Q-value
            Params
            ======
                experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """

        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        # Construct next actions vector relative to the agent
        if agent_number == 0:
            actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1)
        else:
            actions_next = torch.cat((actions[:, :2], actions_next), dim=1)
        # Compute Q targets for current states (y_i)
        Q_targets_next = self.critic_target(next_states, actions_next)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # For MADDPG: Construct action vector for each agent
        actions_pred = self.actor_local(states)
        if agent_number == 0:
            actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1)
        else:
            actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1)
        # Compute actor loss
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # update noise decay parameter
        #self.eps -= self.eps_decay
        #self.eps = max(self.eps, EPS_FINAL)
        #self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters."""
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def save_checkpoint(self, agent_number, filename='checkpoint'):
        checkpoint = {
            'action_size': self.action_size,
            'state_size': self.state_size,
            'actor_state_dict': self.actor_local.state_dict(),
            'critic_state_dict': self.critic_local.state_dict()
        }
        filepath = filename + '_' + str(agent_number) + '.pth'
        torch.save(checkpoint, filepath)
        print(filepath + ' successfully saved.')

    def load_checkpoint(self, agent_number, filename='checkpoint'):
        filepath = filename + '_' + str(agent_number) + '.pth'
        checkpoint = torch.load(filepath)
        state_size = checkpoint['state_size']
        action_size = checkpoint['action_size']
        self.actor_local = Actor(state_size, action_size, seed=42).to(device)
        self.critic_local = Critic(state_size, action_size, seed=42).to(device)
        self.actor_local.load_state_dict(checkpoint['actor_state_dict'])
        self.critic_local.load_state_dict(checkpoint['critic_state_dict'])
        print(filepath + ' successfully loaded.')
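
The DDPGAgent above draws exploration noise from OUNoise((num_agents, action_size), seed) via reset() and sample(). A plausible Ornstein-Uhlenbeck sketch of that class; the mu/theta/sigma defaults are common choices, not values taken from the project:

import copy
import random

import numpy as np


class OUNoise:
    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)
        self.reset()

    def reset(self):
        # restart the process from its long-run mean
        self.state = copy.copy(self.mu)

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1): mean-reverting, correlated noise
        x = self.state
        dx = self.theta * (self.mu - x) + \
            self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = x + dx
        return self.state
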
Example #23
0
        retrace = truncated_rho * (retrace -
                                   q_value.detach()) + values[step].detach()

        loss += actor_loss + critic_loss - entropy

        if args.type == 'trpo':
            loss = TRPO(model, policies, average_policies, 1, loss,
                        policies[step] / average_policies[step])

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if args.batch_size < len(replay_buffer) + 1:
        for _ in range(np.random.poisson(args.replay_ratio)):
            trajecs = replay_buffer.sample(args.batch_size)
            s_x, a_x, r_x, old_pol, m_x = map(
                torch.stack,
                zip(*(map(torch.cat, zip(*trajec)) for trajec in trajecs)))

            q_vals = []
            vals = []
            pols = []
            avg_pols = []

            for step in range(s_x.size(0)):
                pol, q_val, val = model(s_x[step])
                q_vals.append(q_val)
                pols.append(pol)
                vals.append(val)
Example #24
0
class DQNAgent(Agent):
	"""
	Uses a replay buffer and has two DQNs, one that is used to get best actions and updated every step and the other, a target network,
	used to compute the target Q value every step. This target network is only updated with the first DQN only after a fixed number of steps.
	"""
	def __init__(self, env, network, learning_rate, gamma, eps_max, eps_min, eps_dec, buffer_size, replace_cnt):
		super().__init__(env, network, learning_rate, gamma, eps_max, eps_min, eps_dec)

		self.replay_buffer = ReplayBuffer(max_size=buffer_size, input_shape = env.env_shape)

		self.learn_step_counter = 0
		self.replace_cnt = replace_cnt
		self.q_eval = ConvDQN(env.env_shape, env.no_of_actions)
		self.q_target = ConvDQN(env.env_shape, env.no_of_actions)


	def get_action(self, state):

		# epsilon-greedy: compare a uniform [0, 1) draw against eps
		if np.random.random() <= self.eps:
			return self.env.sample_action()

		else:
			state = T.tensor(state, dtype=T.float).unsqueeze(0).to(self.q_eval.device)
			actions = self.q_eval.forward(state)
			return T.argmax(actions).item()

	def replace_target_network(self):
		if self.learn_step_counter % self.replace_cnt == 0:
			self.q_target.load_state_dict(self.q_eval.state_dict())

	def get_batch_tensors(self, batch_size):
		batch = self.replay_buffer.sample(batch_size)
		states, actions, rewards, next_states, dones = batch
		states_t = T.tensor(states, dtype=T.float).to(self.q_eval.device)
		actions_t = T.tensor(actions).to(self.q_eval.device)
		rewards_t = T.tensor(rewards, dtype=T.float).to(self.q_eval.device)
		next_states_t = T.tensor(next_states, dtype=T.float).to(self.q_eval.device)
		dones_t = T.tensor(dones, dtype=T.float).to(self.q_eval.device)
		return states_t, actions_t, rewards_t, next_states_t, dones_t

	def update(self, batch_size):

		states_t, actions_t, rewards_t, next_states_t, dones_t = self.get_batch_tensors(batch_size)
		self.q_eval.optimizer.zero_grad()

		self.replace_target_network()

		indices = np.arange(batch_size)
		curr_Q = self.q_eval.forward(states_t)[indices, actions_t]
		max_next_Q = self.q_target.forward(next_states_t).max(1)[0]
		max_next_Q[dones_t] = 0.0  # no bootstrapping from terminal states
		expected_Q = rewards_t + self.gamma * max_next_Q

		loss = self.q_eval.MSE_loss(curr_Q, expected_Q).to(self.q_eval.device)

		loss.backward()
		self.q_eval.optimizer.step()
		self.learn_step_counter += 1

		self.dec_eps()

	def learn(self,state, action, reward, next_state, done, batch_size):
		self.replay_buffer.store_transition(state, action, reward, next_state, done)

		if len(self.replay_buffer) > batch_size:
			self.update(batch_size)
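
A hedged usage sketch for the agent above; the environment's `reset`/`step` interface is an assumption (only `sample_action`, `env_shape` and `no_of_actions` are referenced in the class), and the hyperparameter values are illustrative:

agent = DQNAgent(env, network=ConvDQN, learning_rate=1e-4, gamma=0.99,
                 eps_max=1.0, eps_min=0.01, eps_dec=1e-5,
                 buffer_size=50000, replace_cnt=1000)

for episode in range(500):
    state, done = env.reset(), False               # assumed env interface
    while not done:
        action = agent.get_action(state)
        next_state, reward, done = env.step(action)  # assumed signature
        agent.learn(state, action, reward, next_state, done, batch_size=32)
        state = next_state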
Example #25
0
def train(target_vars, saver, sess, logger, dataloader, resume_iter, logdir):
    X = target_vars['X']
    Y = target_vars['Y']
    X_NOISE = target_vars['X_NOISE']
    train_op = target_vars['train_op']
    energy_pos = target_vars['energy_pos']
    energy_neg = target_vars['energy_neg']
    loss_energy = target_vars['loss_energy']
    loss_ml = target_vars['loss_ml']
    loss_total = target_vars['total_loss']
    gvs = target_vars['gvs']
    x_grad = target_vars['x_grad']
    x_grad_first = target_vars['x_grad_first']
    x_off = target_vars['x_off']
    temp = target_vars['temp']
    x_mod = target_vars['x_mod']
    LABEL = target_vars['LABEL']
    LABEL_POS = target_vars['LABEL_POS']
    weights = target_vars['weights']
    test_x_mod = target_vars['test_x_mod']
    eps = target_vars['eps_begin']
    label_ent = target_vars['label_ent']

    if FLAGS.use_attention:
        gamma = weights[0]['atten']['gamma']
    else:
        gamma = tf.zeros(1)

    val_output = [test_x_mod]

    gvs_dict = dict(gvs)

    log_output = [
        train_op, energy_pos, energy_neg, eps, loss_energy, loss_ml,
        loss_total, x_grad, x_off, x_mod, gamma, x_grad_first, label_ent,
        *gvs_dict.keys()
    ]
    output = [train_op, x_mod]

    replay_buffer = ReplayBuffer(10000)
    itr = resume_iter
    x_mod = None
    gd_steps = 1

    dataloader_iterator = iter(dataloader)
    best_inception = 0.0

    for epoch in range(FLAGS.epoch_num):
        print("Training epoch:%d" % epoch)
        for data_corrupt, data, label in dataloader:
            data_corrupt = data_corrupt.numpy()
            data_corrupt_init = data_corrupt.copy()

            data = data.numpy()
            label = label.numpy()

            label_init = label.copy()

            if FLAGS.mixup:
                idx = np.random.permutation(data.shape[0])
                lam = np.random.beta(1, 1, size=(data.shape[0], 1, 1, 1))
                data = data * lam + data[idx] * (1 - lam)

            if FLAGS.replay_batch and (x_mod is not None):
                replay_buffer.add(compress_x_mod(x_mod))

                if len(replay_buffer) > FLAGS.batch_size:
                    replay_batch = replay_buffer.sample(FLAGS.batch_size)
                    replay_batch = decompress_x_mod(replay_batch)
                    replay_mask = (np.random.uniform(0, FLAGS.rescale,
                                                     FLAGS.batch_size) > 0.05)
                    data_corrupt[replay_mask] = replay_batch[replay_mask]

            if FLAGS.pcd:
                if x_mod is not None:
                    data_corrupt = x_mod

            feed_dict = {X_NOISE: data_corrupt, X: data, Y: label}

            if FLAGS.cclass:
                feed_dict[LABEL] = label
                feed_dict[LABEL_POS] = label_init

            if itr % FLAGS.log_interval == 0:
                _, e_pos, e_neg, eps, loss_e, loss_ml, loss_total, x_grad, x_off, x_mod, gamma, x_grad_first, label_ent, * \
                    grads = sess.run(log_output, feed_dict)

                kvs = {}
                kvs['e_pos'] = e_pos.mean()
                kvs['e_pos_std'] = e_pos.std()
                kvs['e_neg'] = e_neg.mean()
                kvs['e_diff'] = kvs['e_pos'] - kvs['e_neg']
                kvs['e_neg_std'] = e_neg.std()
                kvs['temp'] = temp
                kvs['loss_e'] = loss_e.mean()
                kvs['eps'] = eps.mean()
                kvs['label_ent'] = label_ent
                kvs['loss_ml'] = loss_ml.mean()
                kvs['loss_total'] = loss_total.mean()
                kvs['x_grad'] = np.abs(x_grad).mean()
                kvs['x_grad_first'] = np.abs(x_grad_first).mean()
                kvs['x_off'] = x_off.mean()
                kvs['iter'] = itr
                kvs['gamma'] = gamma

                for v, k in zip(grads, [v.name for v in gvs_dict.values()]):
                    kvs[k] = np.abs(v).max()

                string = "Obtained a total of "
                for key, value in kvs.items():
                    string += "{}: {}, ".format(key, value)

                if hvd.rank() == 0:
                    print(string)
                    logger.writekvs(kvs)
            else:
                _, x_mod = sess.run(output, feed_dict)

            if itr % FLAGS.save_interval == 0 and hvd.rank() == 0:
                saver.save(
                    sess,
                    osp.join(FLAGS.logdir, FLAGS.exp, 'model_{}'.format(itr)))

            if itr % FLAGS.test_interval == 0 and hvd.rank(
            ) == 0 and FLAGS.dataset != '2d':
                try_im = x_mod
                orig_im = data_corrupt.squeeze()
                actual_im = rescale_im(data)

                orig_im = rescale_im(orig_im)
                try_im = rescale_im(try_im).squeeze()

                for i, (im, t_im, actual_im_i) in enumerate(
                        zip(orig_im[:20], try_im[:20], actual_im)):
                    shape = orig_im.shape[1:]
                    new_im = np.zeros((shape[0], shape[1] * 3, *shape[2:]))
                    size = shape[1]
                    new_im[:, :size] = im
                    new_im[:, size:2 * size] = t_im
                    new_im[:, 2 * size:] = actual_im_i

                    log_image(new_im,
                              logger,
                              'train_gen_{}'.format(itr),
                              step=i)

                test_im = x_mod

                try:
                    data_corrupt, data, label = next(dataloader_iterator)
                except StopIteration:
                    dataloader_iterator = iter(dataloader)
                    data_corrupt, data, label = next(dataloader_iterator)

                data_corrupt = data_corrupt.numpy()

                if FLAGS.replay_batch and (
                        x_mod is not None) and len(replay_buffer) > 0:
                    replay_batch = replay_buffer.sample(FLAGS.batch_size)
                    replay_batch = decompress_x_mod(replay_batch)
                    replay_mask = (np.random.uniform(0, 1, (FLAGS.batch_size))
                                   > 0.05)
                    data_corrupt[replay_mask] = replay_batch[replay_mask]

                if FLAGS.dataset == 'cifar10' or FLAGS.dataset == 'imagenet' or FLAGS.dataset == 'imagenetfull':
                    n = 128

                    if FLAGS.dataset == "imagenetfull":
                        n = 32

                    if len(replay_buffer) > n:
                        data_corrupt = decompress_x_mod(
                            replay_buffer.sample(n))
                    elif FLAGS.dataset == 'imagenetfull':
                        data_corrupt = np.random.uniform(
                            0, FLAGS.rescale, (n, 128, 128, 3))
                    else:
                        data_corrupt = np.random.uniform(
                            0, FLAGS.rescale, (n, 32, 32, 3))

                    if FLAGS.dataset == 'cifar10':
                        label = np.eye(10)[np.random.randint(0, 10, (n))]
                    else:
                        label = np.eye(1000)[np.random.randint(0, 1000, (n))]

                feed_dict[X_NOISE] = data_corrupt

                feed_dict[X] = data

                if FLAGS.cclass:
                    feed_dict[LABEL] = label

                test_x_mod = sess.run(val_output, feed_dict)

                try_im = test_x_mod
                orig_im = data_corrupt.squeeze()
                actual_im = rescale_im(data.numpy())

                orig_im = rescale_im(orig_im)
                try_im = rescale_im(try_im).squeeze()

                for i, (im, t_im, actual_im_i) in enumerate(
                        zip(orig_im[:20], try_im[:20], actual_im)):

                    shape = orig_im.shape[1:]
                    new_im = np.zeros((shape[0], shape[1] * 3, *shape[2:]))
                    size = shape[1]
                    new_im[:, :size] = im
                    new_im[:, size:2 * size] = t_im
                    new_im[:, 2 * size:] = actual_im_i
                    log_image(new_im, logger, 'val_gen_{}'.format(itr), step=i)

                score, std = get_inception_score(list(try_im), splits=1)
                print("///Inception score of {} with std of {}".format(
                    score, std))
                kvs = {}
                kvs['inception_score'] = score
                kvs['inception_score_std'] = std
                logger.writekvs(kvs)

                if score > best_inception:
                    best_inception = score
                    saver.save(sess,
                               osp.join(FLAGS.logdir, FLAGS.exp, 'model_best'))

            if itr > 60000 and FLAGS.dataset == "mnist":
                assert False
            itr += 1
            print("Training iteration:%d" % itr)

    saver.save(sess, osp.join(FLAGS.logdir, FLAGS.exp, 'model_{}'.format(itr)))
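
`compress_x_mod` / `decompress_x_mod` are used above to keep the replay buffer small but are not shown. A plausible sketch, assuming samples lie in [0, FLAGS.rescale] and are quantized to uint8 on the way into the buffer; the original helpers may differ:

import numpy as np

def compress_x_mod(x_mod, rescale=1.0):
    # quantize floats in [0, rescale] to uint8 so the buffer stores bytes
    return (255.0 * np.clip(x_mod, 0.0, rescale) / rescale).astype(np.uint8)

def decompress_x_mod(x_mod, rescale=1.0):
    # map back to floats and add small uniform noise to undo the quantization bias
    return x_mod.astype(np.float32) / 256.0 * rescale + \
        np.random.uniform(0.0, rescale / 256.0, x_mod.shape)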
Example #26
0
class Agent():
    def __init__(self,
                 state_size,
                 action_size,
                 policy_network,
                 value_network,
                 n_agents,
                 device,
                 use_gae=True):

        self.state_size = state_size
        self.action_size = action_size
        self.n_agents = n_agents
        self.device = device

        self.policy_network = policy_network(
            state_size=state_size, action_size=action_size).to(device)
        self.policy_optimizer = optim.Adam(self.policy_network.parameters(),
                                           lr=LR)

        self.value_network = value_network(state_size=state_size,
                                           action_size=1).to(device)
        self.value_optimizer = optim.Adam(self.value_network.parameters(),
                                          lr=LR)
        self.epsilon = EPSILON
        self.beta = BETA

        self.reset_memory()
        self.buffer = ReplayBuffer(int(128), 64)
        self.use_gae = use_gae

    def reset_memory(self):
        self.rnn_memory = None

    def policy_loss(self,
                    old_log_probs,
                    states,
                    actions,
                    rewards,
                    epsilon=EPSILON,
                    beta=BETA):

        distribution, _ = self.policy_network(states, None)
        new_log_prob = distribution.log_prob(actions)
        new_probs = torch.exp(new_log_prob)
        ratio = torch.exp(new_log_prob - old_log_probs)

        # clipped function
        clip = torch.clamp(ratio, 1 - epsilon, 1 + epsilon)
        rewards = rewards.reshape(self.n_agents, clip.shape[1], -1)
        clipped_surrogate = torch.min(ratio * rewards, clip * rewards)
        entropy = -(new_probs * old_log_probs +
                    (1.0 - new_probs) * old_log_probs)
        # negate so that gradient descent on this loss maximizes the clipped surrogate
        loss = -(clipped_surrogate + beta * entropy).mean()

        return loss

    def value_loss(self, states, rewards):
        estimated_value = self.value_network(states).reshape(self.n_agents, -1)
        return (estimated_value - rewards).pow(2).mean(1).mean()

    def act(self, state):
        state = torch.from_numpy(state).float().to(self.device).unsqueeze(1)
        self.policy_network.eval()
        with torch.no_grad():
            action_distribution, self.rnn_memory = self.policy_network(
                state, self.rnn_memory)
        self.policy_network.train()

        action = action_distribution.sample()
        return action.detach().cpu().numpy()

    def action_probs(self, states, actions):
        self.policy_network.eval()
        log_probs = None
        with torch.no_grad():
            distribution, _ = self.policy_network(states, None)
            log_probs = distribution.log_prob(actions).detach()
        self.policy_network.train()
        return log_probs

    def learn(self, trajectory):
        states = torch.from_numpy(trajectory['states']).float().to(self.device)
        actions = torch.from_numpy(trajectory['actions']).float().to(
            self.device)
        rewards = rewards_to_go(trajectory['rewards'], self.n_agents,
                                self.device)
        next_states = torch.from_numpy(trajectory['next_states']).float().to(
            self.device)
        dones = torch.from_numpy(trajectory['dones']).float().to(self.device)
        log_probs = self.action_probs(states, actions)

        policy_signal = None
        if self.use_gae:
            self.buffer.add(states, rewards)
            policy_signal = generalized_advantage_estimate(
                states, rewards, next_states, dones,
                self.value_network).detach()
        else:
            policy_signal = rewards

        # print(policy_signal.shape)
        # policy_signal = (policy_signal - policy_signal.mean()) / (policy_signal.std() + 1e-10)

        # Optimize Policy
        for _ in range(TRAIN_P_ITERS):
            self.policy_optimizer.zero_grad()
            pl = self.policy_loss(log_probs, states, actions, policy_signal,
                                  self.epsilon, self.beta)
            writer.add_scalar('loss/policy', pl.cpu().detach().numpy())
            pl.backward()
            torch.nn.utils.clip_grad_norm_(self.policy_network.parameters(), 1)
            self.policy_optimizer.step()
            del pl

        if self.use_gae:
            # Optimize Value Function
            for _ in range(TRAIN_V_ITERS):
                self.value_optimizer.zero_grad()
                s_, r_ = self.buffer.sample()
                all_rewards = torch.stack(r_)
                r_mean = all_rewards.mean()
                r_std = all_rewards.std() + 1e-10
                losses = []
                for s, r in zip(s_, r_):
                    losses.append(self.value_loss(s, r).mean())
                loss = torch.stack(losses).mean()
                writer.add_scalar('loss/value', loss.cpu().detach().numpy())
                loss.backward()
                self.value_optimizer.step()
                del loss

        self.epsilon *= .999
        self.beta *= .995

        self.reset_memory()
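
`generalized_advantage_estimate` is called in `learn` but not defined above. A minimal sketch of the standard GAE(λ) computation under assumed (steps × agents) tensor shapes; the original helper may differ:

import torch

def generalized_advantage_estimate(states, rewards, next_states, dones,
                                   value_network, gamma=0.99, lam=0.95):
    with torch.no_grad():
        values = value_network(states).squeeze(-1)
        next_values = value_network(next_states).squeeze(-1)
    # one-step TD residuals, then the exponentially weighted backward sum
    deltas = rewards + gamma * next_values * (1 - dones) - values
    advantages = torch.zeros_like(deltas)
    gae = torch.zeros_like(deltas[-1])
    for t in reversed(range(deltas.shape[0])):
        gae = deltas[t] + gamma * lam * (1 - dones[t]) * gae
        advantages[t] = gae
    return advantages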
Example #27
0
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, num_agents, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)
        self.timestep = 0

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def step(self, state, action, reward, next_state, done, agent_num):
        """Save experience in replay memory,
        and use random sample from buffer to learn."""
        self.timestep += 1

        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA, agent_num)

    def act(self, states, eps, add_noise=True):
        """Returns actions for given state as per current policy."""
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            for agent_num, state in enumerate(states):
                action = self.actor_local(state).cpu().data.numpy()
                actions[agent_num, :] = action
        self.actor_local.train()
        if add_noise:
            actions += eps * self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma, agent_num):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]):
                tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # -------------------------- update critic -------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        if agent_num == 0:
            actions_next = torch.cat((actions_next, actions[:, :2]), dim=1)
        else:
            actions_next = torch.cat((actions[:, :2], actions_next), dim=1)

        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # -------------------------- update actor -------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        if agent_num == 0:
            actions_pred = torch.cat((actions_pred, actions[:, :2]), dim=1)
        else:
            actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # --------------------- update target networks --------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
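
The agent above depends on module-level constants (BUFFER_SIZE, BATCH_SIZE, GAMMA, TAU, LR_ACTOR, LR_CRITIC, WEIGHT_DECAY, device) that are defined elsewhere. An illustrative block with assumed values, not the original author's settings:

import torch

BUFFER_SIZE = int(1e5)   # replay buffer size
BATCH_SIZE = 128         # minibatch size
GAMMA = 0.99             # discount factor
TAU = 1e-3               # soft-update interpolation factor
LR_ACTOR = 1e-4          # actor learning rate
LR_CRITIC = 1e-3         # critic learning rate
WEIGHT_DECAY = 0.0       # critic L2 weight decay

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")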
Example #28
0
class Agent():
    """Interacts with and learns from the environment."""
    
    def __init__(self, state_size, action_size, seed=0, lr_actor=LR_ACTOR, lr_critic=LR_CRITIC, gamma=GAMMA, checkpoint_path='./checkpoints/', pretrained=False):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            lr_actor (float): learning rate of the actor
            lr_critic (float): learning rate of the critic
            gamma (float): discount factor
            checkpoint_path (str): directory for saving/loading checkpoints
            pretrained (bool): if True, load actor/critic weights from checkpoint_path
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.gamma = gamma
        self.checkpoint_path = checkpoint_path

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic)
        
        # If pretrained, load weights
        if pretrained:
            actor_dict = torch.load(os.path.join(self.checkpoint_path,'checkpoint_actor.pth'))
            critic_dict = torch.load(os.path.join(self.checkpoint_path,'checkpoint_critic.pth'))
            self.actor_local.load_state_dict(actor_dict)
            self.actor_target.load_state_dict(actor_dict)
            self.critic_local.load_state_dict(critic_dict)
            self.critic_target.load_state_dict(critic_dict)

        # Noise process
        self.noise = OUNoise(action_size, seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, device)
    
    def step(self, state, action, reward, next_state, done, tstep=LEARN_EVERY+1):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE and tstep % LEARN_EVERY == 0:
            for _ in range(LEARN_NUM):
                experiences = self.memory.sample()
                self.learn(experiences)
            
    def train(self, env, n_episodes=1000):
        """Deep Deterministic Policy Gradient (DDPG) Learning.

        Params
        ======
            env (UnityEnvironment): Unity environment
            n_episodes (int): maximum number of training episodes
        """
        # create checkpoints folder if necessary
        if not os.path.exists(self.checkpoint_path): os.makedirs(self.checkpoint_path)
        # get the default brain
        brain_name = env.brain_names[0]
        env_info = env.reset(train_mode=True)[brain_name]
        num_agents = len(env_info.agents)
        # last 100 scores
        scores_deque = deque(maxlen=100)
        # list containing scores from each episode
        all_scores = []
        # list containing window averaged scores
        avg_scores = []
        # for each episode
        for i_episode in range(1, n_episodes+1):
            # reset environment
            env_info = env.reset(train_mode=True)[brain_name]
            states = env_info.vector_observations
            # reset noise
            self.reset()
            scores = np.zeros(num_agents) 
            # for each timepoint
            t=0
            while True:
                # agent action
                actions = self.act(states)
                # get the next state
                env_info = env.step(actions)[brain_name]
                next_states = env_info.vector_observations
                # get the reward
                rewards = env_info.rewards
                # see if episode has ended
                dones = env_info.local_done
                # step
                for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
                    self.step(state, action, reward, next_state, done, t)
                states = next_states
                scores += rewards
                t+=1
                if np.any(dones):
                    break 
            # save most recent score
            max_score = np.max(scores)
            scores_deque.append(max_score)
            all_scores.append(max_score)
            avg_scores.append(np.mean(scores_deque))
            print('\rEpisode {}\tMax Score: {:.2f}\tAverage Score: {:.2f}'.format(i_episode, max_score, np.mean(scores_deque)), end="")
            if i_episode % 50 == 0:
                print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))
            if np.mean(scores_deque)>=0.5:
                print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100, np.mean(scores_deque)))
                torch.save(self.actor_local.state_dict(), self.checkpoint_path+'checkpoint_actor.pth')
                torch.save(self.critic_local.state_dict(), self.checkpoint_path+'checkpoint_critic.pth')
                break
            
        return all_scores, avg_scores

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)                     

        self.reset()
        
    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
            
    def play(self, env, n_episodes=5):
        """Play a few episodes with trained agents.

        Params
        ======
            env (UnityEnvironment): Unity environment
            n_episodes (int): number of episodes to play
        """
        # get the default brain
        brain_name = env.brain_names[0]
        brain = env.brains[brain_name]

        # reset the environment
        env_info = env.reset(train_mode=False)[brain_name]
        num_agents = len(env_info.agents)
        action_size = brain.vector_action_space_size
        state_size = env_info.vector_observations.shape[1]

        # for each episode
        for i_episode in range(1, n_episodes+1):
            env_info = env.reset(train_mode=False)[brain_name]
            states = env_info.vector_observations
            self.reset() # set the noise to zero
            score = np.zeros(num_agents)
            while(True):
                actions = self.act(states, add_noise=False)
                env_info = env.step(actions)[brain_name]
                # get the next states
                next_states = env_info.vector_observations             
                # get the rewards
                rewards = env_info.rewards                             
                # see if the episode has finished for any agent
                dones = env_info.local_done                            

                self.step(states, actions, rewards, next_states, dones)
                states = next_states
                score += rewards
                if np.any(dones):
                    break

            print('Best Score:', np.max(score))    
        env.close()
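
A hedged end-to-end usage sketch for the agent above, assuming the Udacity Unity "Tennis" environment (24-dimensional observations and 2 continuous actions per agent); the executable path is a placeholder:

from unityagents import UnityEnvironment

env = UnityEnvironment(file_name='Tennis.app')   # placeholder path
agent = Agent(state_size=24, action_size=2, seed=0)
scores, avg_scores = agent.train(env, n_episodes=2000)
agent.play(env, n_episodes=3)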
Example #29
0
def train(target_vars, saver, sess, logger, dataloader, resume_iter, logdir):
    X = target_vars['X']
    X_NOISE = target_vars['X_NOISE']
    train_op = target_vars['train_op']
    energy_pos = target_vars['energy_pos']
    energy_neg = target_vars['energy_neg']
    loss_energy = target_vars['loss_energy']
    loss_ml = target_vars['loss_ml']
    loss_total = target_vars['total_loss']
    gvs = target_vars['gvs']
    x_off = target_vars['x_off']
    x_grad = target_vars['x_grad']
    x_mod = target_vars['x_mod']
    LABEL = target_vars['LABEL']
    HIER_LABEL = target_vars['HIER_LABEL']
    LABEL_POS = target_vars['LABEL_POS']
    eps = target_vars['eps_begin']
    ATTENTION_MASK = target_vars['ATTENTION_MASK']
    attention_mask = target_vars['attention_mask']
    attention_grad = target_vars['attention_grad']

    if FLAGS.prelearn_model or FLAGS.prelearn_model_shape:
        models_pretrain = target_vars['models_pretrain']

    if not FLAGS.comb_mask:
        attention_mask = tf.zeros(1)
        attention_grad = tf.zeros(1)

    if FLAGS.use_attention:
        gamma = target_vars['weights']['atten']['gamma']
    else:
        gamma = tf.zeros(1)


    gvs_dict = dict(gvs)

    log_output = [
        train_op,
        energy_pos,
        energy_neg,
        eps,
        loss_energy,
        loss_ml,
        loss_total,
        x_grad,
        x_off,
        x_mod,
        attention_mask,
        attention_grad,
        *gvs_dict.keys()]
    output = [train_op, x_mod]
    print("log_output ", log_output)

    replay_buffer = ReplayBuffer(10000)
    itr = resume_iter
    x_mod = None
    gd_steps = 1

    dataloader_iterator = iter(dataloader)
    best_inception = 0.0

    for epoch in range(FLAGS.epoch_num):
        for data_corrupt, data, label in dataloader:
            data_corrupt = data_corrupt.numpy()
            data_corrupt_init = data_corrupt.copy()

            data = data.numpy()

            if FLAGS.mixup:
                idx = np.random.permutation(data.shape[0])
                lam = np.random.beta(1, 1, size=(data.shape[0], 1, 1, 1))
                data = data * lam + data[idx] * (1 - lam)

            if FLAGS.replay_batch and (x_mod is not None) and not FLAGS.joint_baseline:
                replay_buffer.add(compress_x_mod(x_mod))

                if len(replay_buffer) > FLAGS.batch_size:
                    replay_batch = replay_buffer.sample(FLAGS.batch_size)
                    replay_batch = decompress_x_mod(replay_batch)
                    replay_mask = (
                        np.random.uniform(
                            0,
                            FLAGS.rescale,
                            FLAGS.batch_size) > FLAGS.keep_ratio)
                    data_corrupt[replay_mask] = replay_batch[replay_mask]

            if FLAGS.pcd:
                if x_mod is not None:
                    data_corrupt = x_mod

            attention_mask = np.random.uniform(-1., 1., (data.shape[0], 64, 64, int(FLAGS.cond_func)))
            feed_dict = {X_NOISE: data_corrupt, X: data, ATTENTION_MASK: attention_mask}

            if FLAGS.joint_baseline:
                feed_dict[target_vars['NOISE']] = np.random.uniform(-1., 1., (data.shape[0], 128))

            if FLAGS.prelearn_model or FLAGS.prelearn_model_shape:
                _, _, labels = zip(*models_pretrain)
                labels = [LABEL, LABEL_POS] + list(labels)
                for lp, l in zip(labels, label):
                    # print("lp, l ", lp, l)
                    # print("l shape ", l.shape)
                    feed_dict[lp] = l
            else:
                label = label.numpy()
                label_init = label.copy()
                if FLAGS.cclass:
                    feed_dict[LABEL] = label
                    feed_dict[LABEL_POS] = label_init

            if FLAGS.heir_mask:
                feed_dict[HIER_LABEL] = label

            if itr % FLAGS.log_interval == 0:
                # print(feed_dict.keys())
                # print(feed_dict)
                _, e_pos, e_neg, eps, loss_e, loss_ml, loss_total, x_grad, x_off, x_mod, attention_mask, attention_grad, * \
                    grads = sess.run(log_output, feed_dict)


                kvs = {}
                kvs['e_pos'] = e_pos.mean()
                kvs['e_pos_std'] = e_pos.std()
                kvs['e_neg'] = e_neg.mean()
                kvs['e_diff'] = kvs['e_pos'] - kvs['e_neg']
                kvs['e_neg_std'] = e_neg.std()
                kvs['loss_e'] = loss_e.mean()
                kvs['loss_ml'] = loss_ml.mean()
                kvs['loss_total'] = loss_total.mean()
                kvs['x_grad'] = np.abs(x_grad).mean()
                kvs['attention_grad'] = np.abs(attention_grad).mean()
                kvs['x_off'] = x_off.mean()
                kvs['iter'] = itr

                for v, k in zip(grads, [v.name for v in gvs_dict.values()]):
                    kvs[k] = np.abs(v).max()

                string = "Obtained a total of "
                for key, value in kvs.items():
                    string += "{}: {}, ".format(key, value)

                if kvs['e_diff'] < -0.5:
                    print("Training is unstable")
                    assert False

                print(string)
                logger.writekvs(kvs)
            else:
                _, x_mod = sess.run(output, feed_dict)

            if itr % FLAGS.save_interval == 0:
                saver.save(
                    sess,
                    osp.join(
                        FLAGS.logdir,
                        FLAGS.exp,
                        'model_{}'.format(itr)))

            if itr > 30000:
                assert False

            # For some reason conditioning on position fails earlier
            # if FLAGS.cond_pos and itr > 30000:
            #     assert False

            if itr % FLAGS.test_interval == 0 and not FLAGS.joint_baseline and FLAGS.dataset != 'celeba':
                try_im = x_mod
                orig_im = data_corrupt.squeeze()
                actual_im = rescale_im(data)

                if not FLAGS.comb_mask:
                    attention_mask = np.random.uniform(-1., 1., (data.shape[0], 64, 64, int(FLAGS.cond_func)))

                orig_im = rescale_im(orig_im)
                try_im = rescale_im(try_im).squeeze()
                attention_mask = rescale_im(attention_mask)

                for i, (im, t_im, actual_im_i, attention_im) in enumerate(
                        zip(orig_im[:20], try_im[:20], actual_im, attention_mask)):
                    im, t_im, actual_im_i, attention_im = im[::-1], t_im[::-1], actual_im_i[::-1], attention_im[::-1]
                    shape = orig_im.shape[1:]
                    new_im = np.zeros((shape[0], shape[1] * (3 + FLAGS.cond_func), *shape[2:]))
                    size = shape[1]
                    new_im[:, :size] = im
                    new_im[:, size:2 * size] = t_im
                    new_im[:, 2 * size: 3 * size] = actual_im_i

                    for i in range(FLAGS.cond_func):
                        new_im[:, (3+i) * size: (4+i) * size] = np.tile(attention_im[:, :, i:i+1], (1, 1, 3))

                    log_image(
                        new_im, logger, 'train_gen_{}'.format(itr), step=i)

                test_im = x_mod

                try:
                    data_corrupt, data, label = next(dataloader_iterator)
                except StopIteration:
                    dataloader_iterator = iter(dataloader)
                    data_corrupt, data, label = next(dataloader_iterator)

                data_corrupt = data_corrupt.numpy()


            itr += 1

    saver.save(sess, osp.join(FLAGS.logdir, FLAGS.exp, 'model_{}'.format(itr)))
Example #30
0
def train():
    """
    init dir and log config
    """
    init_cluster_ray()
    base_dir, ckpt_dir, summary_dir = init_dir_and_log()

    kwargs = FLAGS.flag_values_dict()
    kwargs["BASE_DIR"] = base_dir
    kwargs["ckpt_dir"] = ckpt_dir
    act_space = int(FLAGS.act_space)
    kwargs["act_space"] = act_space
    """
    get one seg from rollout worker for dtype and shapes

    :param kwargs rollout worker config
    """
    logging.info('get one seg from Evaluator for dtype and shapes')
    ps = AsyncPS.remote()
    small_data_collector = RolloutCollector(
        server_nums=1,
        ps=ps,
        policy_evaluator_build_func=build_policy_evaluator,
        **kwargs)
    cache_struct_path = '/tmp/%s.pkl' % FLAGS.dir
    structure = fetch_one_structure(small_data_collector,
                                    cache_struct_path=cache_struct_path,
                                    is_head=True)
    del small_data_collector
    """
        init data prefetch thread, prepare_input_pipe
    """
    keys = list(structure.keys())
    dtypes = [structure[k].dtype for k in keys]
    shapes = [structure[k].shape for k in keys]
    segBuffer = tf.queue.RandomShuffleQueue(
        capacity=FLAGS.qsize * FLAGS.batch_size,
        min_after_dequeue=FLAGS.qsize * FLAGS.batch_size // 2,
        dtypes=dtypes,
        shapes=shapes,
        names=keys,
        shared_name="buffer")
    server_nums = FLAGS.nof_evaluator
    nof_server_gpus = FLAGS.nof_server_gpus
    server_nums_refine = server_nums // nof_server_gpus
    data_collector = RolloutCollector(
        server_nums=server_nums_refine,
        ps=ps,
        policy_evaluator_build_func=build_policy_evaluator,
        **kwargs)

    config = tf.ConfigProto(
        allow_soft_placement=True,
        gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=1))
    config.gpu_options.allow_growth = True

    sess = tf.Session(config=config)

    reader = QueueReader(sess=sess,
                         global_queue=segBuffer,
                         data_collector=data_collector,
                         keys=keys,
                         dtypes=dtypes,
                         shapes=shapes)
    reader.daemon = True
    reader.start()

    dequeued = segBuffer.dequeue_many(FLAGS.batch_size)

    # //////////////////////
    if FLAGS.use_demo:
        demo_buffer = build_demo_buffer(keys, 0.9)

        replay_buffer = ReplayBuffer(10000, keys)
        batch_weights = tf.placeholder(tf.float32, shape=[None])

        phs = {
            key: tf.placeholder(dtype=dtype, shape=[None] + list(shape))
            for key, dtype, shape in zip(keys, dtypes, shapes)
        }

        from_where = phs
    else:
        from_where = dequeued
        batch_weights = tf.ones(FLAGS.batch_size)

    # //////////////////////

    prephs, postphs = dict(), dict()
    for k, v in from_where.items():
        if k == "state_in":
            prephs[k] = v
        else:
            prephs[k], postphs[k] = tf.split(
                v, [FLAGS.burn_in, FLAGS.seqlen + FLAGS.n_step], axis=1)
    prekeys = list(prephs.keys())
    postkeys = list(postphs.keys())
    """
        count frame and total steps
    """
    num_frames = tf.get_variable('num_environment_frames',
                                 initializer=tf.zeros_initializer(),
                                 shape=[],
                                 dtype=tf.int32,
                                 trainable=False)
    tf.summary.scalar("frames", num_frames)
    global_step = tf.train.get_or_create_global_step()

    dur_time_tensor = tf.placeholder(dtype=tf.float32)
    tf.summary.scalar('time_per_step', dur_time_tensor)
    """
        set stage_op and build learner
    """
    with tf.device("/gpu"):
        if FLAGS.use_stage:
            area = tf.contrib.staging.StagingArea(
                [prephs[key].dtype for key in prekeys] +
                [postphs[key].dtype
                 for key in postkeys], [prephs[key].shape for key in prekeys] +
                [postphs[key].shape for key in postkeys])
            stage_op = area.put([prephs[key] for key in prekeys] +
                                [postphs[key] for key in postkeys])
            from_stage = area.get()
            predatas = {key: from_stage[i] for i, key in enumerate(prekeys)}
            postdatas = {
                key: from_stage[i + len(prekeys)]
                for i, key in enumerate(postkeys)
            }
        else:
            stage_op = []
            predatas, postdatas = prephs, postphs

        num_frames_and_train, global_step_and_train, init_target_op, priority, beta = build_learner(
            pre=predatas,
            post=postdatas,
            act_space=act_space,
            num_frames=num_frames,
            batch_weights=batch_weights)
    """
        add summary
    """
    summary_ops = tf.summary.merge_all()
    summary_writer = tf.summary.FileWriter(summary_dir, sess.graph)
    """
        initialize and save ckpt
    """
    saver = tf.train.Saver(max_to_keep=100, keep_checkpoint_every_n_hours=6)
    ckpt = tf.train.get_checkpoint_state(ckpt_dir)
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        sess.run(tf.global_variables_initializer())
    ws = Model.get_ws(sess)
    logging.info('pushing weight to ps')
    ray.get(ps.push.remote(ws))

    saver.save(sess, os.path.join(ckpt_dir, "CKPT"), global_step=global_step)
    """
        step
    """
    total_frames = 0
    sess.run(stage_op)
    sess.run(init_target_op)
    if FLAGS.use_demo:
        dequeued_datas, sample_beta = sess.run([dequeued, beta])
        replay_buffer.add_batch(dequeued_datas, FLAGS.batch_size)

    dur_time = 0
    while total_frames < FLAGS.total_environment_frames:
        start = time.time()

        if FLAGS.use_demo:
            batch_size = np.random.binomial(FLAGS.batch_size - 2, 0.99) + 1
            demo_batch_size = FLAGS.batch_size - batch_size

            datas = replay_buffer.sample(batch_size)
            demo_datas, demo_is_weights, demo_idxes = demo_buffer.sample(
                demo_batch_size, sample_beta)

            fd = {
                phs[k]: np.concatenate([datas[k], demo_datas[k]], axis=0)
                for k in keys
            }
            fd[batch_weights] = np.concatenate(
                [np.ones(batch_size),
                 np.zeros(demo_batch_size)], axis=0)
            fd[dur_time_tensor] = dur_time

            total_frames, gs, summary, _, p, sample_beta, dequeued_datas = sess.run(
                [
                    num_frames_and_train, global_step_and_train, summary_ops,
                    stage_op, priority, beta, dequeued
                ],
                feed_dict=fd)

            demo_buffer.update_priorities(demo_idxes, p[batch_size:])
            replay_buffer.add_batch(dequeued_datas, FLAGS.batch_size)
        else:
            fd = {dur_time_tensor: dur_time}

            total_frames, gs, summary, _ = sess.run([
                num_frames_and_train, global_step_and_train, summary_ops,
                stage_op
            ],
                                                    feed_dict=fd)

        if gs % FLAGS.target_update == 0:
            sess.run(init_target_op)

        if gs % 25 == 0:
            ws = Model.get_ws(sess)
            logging.info('pushing weight to ps')
            try:
                ray.get(ps.push.remote(ws))
            except ray.exceptions.UnreconstructableError as e:
                logging.info(str(e))
            except ray.exceptions.RayError as e:
                logging.info(str(e))

        if gs % 1000 == 0:
            saver.save(sess,
                       os.path.join(ckpt_dir, "CKPT"),
                       global_step=global_step)

        if gs % 1 == 0:
            summary_writer.add_summary(summary, global_step=gs)
            dur_time = time.time() - start
            msg = "Global Step %d, Total Frames %d,  Time Consume %.2f" % (
                gs, total_frames, dur_time)
            logging.info(msg)

    saver.save(sess, os.path.join(ckpt_dir, "CKPT"), global_step=global_step)