Example #1
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        np.random.seed(random_seed)  # set the numpy seed

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed, device)
Example #2
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 1000000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters (0.01)
Example #3
    def __init__(self, state_size, action_size, random_seed, actor_layers,
                 critic_layers):
        """ Initialize an Agent object.

        Params
        ======
            state_size (int): size of the environment state
            action_size (int): size of the environment action
            random_seed (int): seed for the random number generators
            actor_layers (array[int]): array containing the size of each layer of the actor network
            critic_layers (array[int]): array containing the size of each layer of the critic network
        """

        self.state_size = state_size
        self.action_size = action_size
        self.random_seed = random_seed
        random.seed(random_seed)
        np.random.seed(random_seed)

        # Actor
        print(f'Agent running on {DEVICE}')
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.random_seed, *actor_layers).to(DEVICE)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.random_seed, *actor_layers).to(DEVICE)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic
        self.critic_local = Critic(self.state_size, self.action_size,
                                   self.random_seed, *critic_layers).to(DEVICE)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    self.random_seed,
                                    *critic_layers).to(DEVICE)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise
        self.noise = OrsnteinUhlenbeck(self.action_size, self.random_seed)

        # Replay Buffer
        self.memory = ReplayBuffer(self.action_size, BUFFER_SIZE, BATCH_SIZE,
                                   self.random_seed)
Example #4
    def __init__(self, env, hp):

        self.env = env
        self.hp = hp
        self.critic = Critic(env.observation_space.shape[0],
                             env.action_space.shape[0], hp)

        self.target_critic = Critic(env.observation_space.shape[0],
                                    env.action_space.shape[0], hp)

        self.actor = Actor(env.observation_space.shape[0],
                           env.action_space.shape[0], env.action_space.high[0],
                           hp)

        self.target_actor = Actor(env.observation_space.shape[0],
                                  env.action_space.shape[0],
                                  env.action_space.high[0], hp)

        self.dataset = ReplayBuffer(self.hp['batch_size'],
                                    self.hp['max_buffer_size'])

        self.noise = OrnsteinUhlenbeckProcess(env.action_space.shape[0],
                                              sigma=self.hp['noise_sigma'])
        self.noise.reset_states()
Example #5
    'critic_threshold': 17.5,
    'critic_suffices_required': 1,
    'critic_steps_start': 200,
    'critic_steps_end': 200,
    'actor_steps_start': 1000,
    'actor_steps_end': 1000,
    'batch_size': 256,
    'seed': 123456,
    'replay_fill_threshold': 1.,
    'random_exploration': True,
    'test_iterations': 30,
    'validation_epoch_mod': 3,
}

# configuring the environment
environment = gym.make('Humanoid-v3')
# environment._max_episode_steps = 600

# setting up the training components
agent = AWRAgent
actor = Actor()
critic = Critic()

# training and testing
Training.train((actor, critic),
               agent,
               environment,
               hyper_ps,
               save=True,
               debug_type=DebugType.NONE)
Example #6
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 1000000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters
        self.score = 0
        self.best_score = -np.inf

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):

        if done:
            reward = self.eval_episode(reward)

        self.add_score(reward)

        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def add_score(self, reward):
        self.score += reward
        if self.best_score < self.score:
            self.best_score = self.score

    def reset_score(self):
        self.score = 0

    def acceptable_episode(self):
        #print(self.task.sim.pose[:3] - self.task.target_pos)
        print(np.linalg.norm(self.task.sim.pose[:3] - self.task.target_pos))

    def eval_episode(self, episode_reward):
        x = self.task.sim.pose[0]
        y = self.task.sim.pose[1]
        z = self.task.sim.pose[2]
        if z <= 0:
            episode_reward -= 35
        elif z >= 145:
            episode_reward -= 25

        if 90 <= z <= 110 and -20 <= x <= 20 and -20 <= y <= 20:
            episode_reward += 40
        elif 65 <= z < 130 and -50 <= x <= 50 and -50 <= y <= 50:
            episode_reward += 50
        return episode_reward

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
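A minimal episode loop showing how the methods above fit together (a sketch only; `task` is the same task object passed to DDPG.__init__, and it is assumed here to expose a step(action) -> (next_state, reward, done) method, which this example does not show):

agent = DDPG(task)
for episode in range(1000):                        # arbitrary episode budget
    state = agent.reset_episode()                  # resets the OU noise and the task
    agent.reset_score()
    done = False
    while not done:
        action = agent.act(state)                  # policy action plus exploration noise
        next_state, reward, done = task.step(action)    # assumed task API
        agent.step(action, reward, next_state, done)    # store experience and learn
        state = next_state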
Example #7
class DDPGAgent:
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        np.random.seed(random_seed)  # set the numpy seed

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed, device)

        # add OU noise for exploration
        self.noise = OUNoise(action_size, scale=1.0, sigma=.1)

    def reset(self):
        self.noise.reset()

    def step(self, states, actions, rewards, next_states, dones, time_step):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward (for each agent)
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory, every LEARN_STEPS steps
        if len(self.memory) > BATCH_SIZE and time_step % LEARN_STEPS == 0:
            for _ in range(N_UPDATES):  # sample N_UPDATES minibatches and perform as many updates
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, states, epsilon=0.0, add_noise=True):
        """Returns actions for given state as per current policy."""
        states = torch.from_numpy(states).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:  # add OU noise (scaled by epsilon) for exploration
            actions += self.noise.noise() * epsilon

        return np.clip(actions, -1, 1)

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        self.__update_critic_local(actions, dones, gamma, next_states, rewards,
                                   states)
        self.__update_actor_local(states)

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def __update_critic_local(self, actions, dones, gamma, next_states,
                              rewards, states):
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

    def __update_actor_local(self, states):
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def network_summary(self):
        print('- Actor Summary (both local and target): ')
        self.actor_local.to(device).summary()

        print('- Critic Summary (both local and target): ')
        self.critic_local.to(device).summary()

    def save(self,
             checkpoint_actor_name='checkpoint_actor',
             checkpoint_critic_name='checkpoint_critic'):
        """Save the actor and critic network weights"""
        torch.save(self.actor_local.state_dict(),
                   path_result_folder(f'{checkpoint_actor_name}.pth'))
        torch.save(self.critic_local.state_dict(),
                   path_result_folder(f'{checkpoint_critic_name}.pth'))

    @staticmethod
    def load(env: UnityEnvironment,
             random_seed=0,
             checkpoint_actor_name='checkpoint_actor',
             checkpoint_critic_name='checkpoint_critic'):
        """Load the actor and critic network weights"""
        # get the default brain
        brain_name = env.brain_names[0]
        brain = env.brains[brain_name]

        env_info = env.reset(train_mode=True)[brain_name]
        state_size = len(env_info.vector_observations[0])
        action_size = brain.vector_action_space_size

        loaded_agent = DDPGAgent(state_size, action_size, random_seed)
        loaded_agent.actor_local.load_state_dict(
            torch.load(path_result_folder(f'{checkpoint_actor_name}.pth')))
        loaded_agent.critic_local.load_state_dict(
            torch.load(path_result_folder(f'{checkpoint_critic_name}.pth')))
        return loaded_agent
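For reference, a rough interaction loop around this agent (a sketch; env, brain_name, state_size, action_size, num_episodes and max_t are placeholders, and env.step(actions)[brain_name] assumes the Unity ML-Agents API that the load() helper above also relies on):

agent = DDPGAgent(state_size, action_size, random_seed=0)
for episode in range(num_episodes):
    env_info = env.reset(train_mode=True)[brain_name]
    states = env_info.vector_observations          # one row per parallel agent
    agent.reset()                                  # reset the OU noise
    for t in range(max_t):
        actions = agent.act(states, epsilon=1.0)
        env_info = env.step(actions)[brain_name]   # assumed Unity step API
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done
        agent.step(states, actions, rewards, next_states, dones, t)
        states = next_states
        if np.any(dones):
            break
agent.save()   # writes checkpoint_actor.pth / checkpoint_critic.pth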
Example #8
def main():
    env = DialogEnvironment()
    experiment_name = args.logdir.split('/')[1] #model name

    torch.manual_seed(args.seed)

    #TODO
    actor = Actor(hidden_size=args.hidden_size,num_layers=args.num_layers,device='cuda',input_size=args.input_size,output_size=args.input_size)
    critic = Critic(hidden_size=args.hidden_size,num_layers=args.num_layers,input_size=args.input_size,seq_len=args.seq_len)
    discrim = Discriminator(hidden_size=args.hidden_size,num_layers=args.num_layers,input_size=args.input_size,seq_len=args.seq_len)
    
    actor.to(device), critic.to(device), discrim.to(device)
    
    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate, 
                              weight_decay=args.l2_rate) 
    discrim_optim = optim.Adam(discrim.parameters(), lr=args.learning_rate)

    # load demonstrations

    writer = SummaryWriter(args.logdir)

    if args.load_model is not None: #TODO
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])
        discrim.load_state_dict(ckpt['discrim'])


    
    episodes = 0
    train_discrim_flag = True

    for iter in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []
        similarity_scores = []
        while steps < args.total_sample_size: 
            state, expert_action, raw_state, raw_expert_action = env.reset()
            score = 0
            similarity_score = 0
            state = state[:args.seq_len,:]
            expert_action = expert_action[:args.seq_len,:]
            state = state.to(device)
            expert_action = expert_action.to(device)
            for _ in range(10000): 

                steps += 1

                mu, std = actor(state.resize(1,args.seq_len,args.input_size)) #TODO: gotta be a better way to resize. 
                action = get_action(mu.cpu(), std.cpu())[0]
                for i in range(5):
                    emb_sum = expert_action[i,:].sum().cpu().item()
                    if emb_sum == 0:
                       # print(i)
                        action[i:,:] = 0 # manual padding
                        break

                done = env.step(action)
                irl_reward = get_reward(discrim, state, action, args)
                if done:
                    mask = 0
                else:
                    mask = 1


                memory.append([state, torch.from_numpy(action).to(device), irl_reward, mask,expert_action])
                score += irl_reward
                similarity_score += get_cosine_sim(expert=expert_action,action=action.squeeze(),seq_len=5)
                #print(get_cosine_sim(s1=expert_action,s2=action.squeeze(),seq_len=5),'sim')
                if done:
                    break

            episodes += 1
            scores.append(score)
            similarity_scores.append(similarity_score)

        score_avg = np.mean(scores)
        similarity_score_avg = np.mean(similarity_scores)
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg))
        print('{}:: {} episode similarity score is {:.2f}'.format(iter, episodes, similarity_score_avg))

        actor.train(), critic.train(), discrim.train()
        if train_discrim_flag:
            expert_acc, learner_acc = train_discrim(discrim, memory, discrim_optim, args) 
            print("Expert: %.2f%% | Learner: %.2f%%" % (expert_acc * 100, learner_acc * 100))
            writer.add_scalar('log/expert_acc', float(expert_acc), iter)  # log
            writer.add_scalar('log/learner_acc', float(learner_acc), iter)  # log
            writer.add_scalar('log/avg_acc', float(learner_acc + expert_acc)/2, iter)  # log
            if args.suspend_accu_exp is not None:  # only check when suspend thresholds are given
                if expert_acc > args.suspend_accu_exp and learner_acc > args.suspend_accu_gen:
                    train_discrim_flag = False

        train_actor_critic(actor, critic, memory, actor_optim, critic_optim, args)
        writer.add_scalar('log/score', float(score_avg), iter)
        writer.add_scalar('log/similarity_score', float(similarity_score_avg), iter)
        writer.add_text('log/raw_state', raw_state[0],iter)
        raw_action = get_raw_action(action) #TODO
        writer.add_text('log/raw_action', raw_action,iter)
        writer.add_text('log/raw_expert_action', raw_expert_action,iter)

        if iter % 100 == 0:
            score_avg = int(score_avg)
            # Open a file with access mode 'a'
            file_object = open(experiment_name+'.txt', 'a')

            result_str = str(iter) + '|' + raw_state[0] + '|' + raw_action + '|' + raw_expert_action + '\n'
            # Append at the end of file
            file_object.write(result_str)
            # Close the file
            file_object.close()

            model_path = os.path.join(os.getcwd(),'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(model_path, experiment_name + '_ckpt_'+ str(score_avg)+'.pth.tar')

            save_checkpoint({
                'actor': actor.state_dict(),
                'critic': critic.state_dict(),
                'discrim': discrim.state_dict(),
                'args': args,
                'score': score_avg,
            }, filename=ckpt_path)
Example #9
def train_ppo (actor, env, epoch_nb, rollout_per_epoch, rollout_len, train_step_per_epoch, init_log_std, model_save_interval, adr_test_prob, tensorboard_path):
	
	mpi_role = nodes.mpi_role
	proc_num = nodes.proc_num
	pnid = nodes.pnid
	
	import os
	import time
	
	from ppo import PPO
	from models.critic import Critic
	
	USE_ADR = hasattr(env, 'adr') and adr_test_prob > 1e-7
	
	if mpi_role == 'main':
		os.makedirs(tensorboard_path)
		
		critic = Critic(env)
		
		trainer = PPO(env, actor, critic, tensorboard_path, init_log_std=init_log_std)
		trainer.model_save_interval = model_save_interval
		
		start_time = time.time()
		
		for n in range(epoch_nb):
			# send the network weights
			# and get the latest rollouts
			msg = {	pnid+"weights" : warehouse.Entry(action="set", value=trainer.get_weights()),
					pnid+"adr" : warehouse.Entry(action="get", value=None),
					pnid+"s" : warehouse.Entry(action="get_l", value=rollout_per_epoch),
					pnid+"a" : warehouse.Entry(action="get_l", value=rollout_per_epoch),
					pnid+"r" : warehouse.Entry(action="get_l", value=rollout_per_epoch),
					pnid+"neglog" : warehouse.Entry(action="get_l", value=rollout_per_epoch),
					pnid+"mask" : warehouse.Entry(action="get_l", value=rollout_per_epoch),
					"dumped" : warehouse.Entry(action="get", value=None)
					}
			data = warehouse.send(msg)
			all_s = np.concatenate(data[pnid+"s"].value, axis=0)
			all_a = np.concatenate(data[pnid+"a"].value, axis=0)
			all_r = np.concatenate(data[pnid+"r"].value, axis=0)
			all_neglog = np.concatenate(data[pnid+"neglog"].value, axis=0)
			all_masks = np.concatenate(data[pnid+"mask"].value, axis=0)
			dumped_rollout_nb = data["dumped"].value
			
			if USE_ADR:
				env.adr.update(data[pnid+"adr"].value)
				env.adr.log()
			
			# update the network weights
			all_last_values, all_gae, all_new_value = trainer.calc_gae(all_s, all_r, all_masks)
			trainer.train_networks(n, all_s, all_a, all_r, all_neglog, all_masks, train_step_per_epoch, all_last_values, all_gae, all_new_value)
			
			#debug
			n_rollouts = all_s.shape[0]
			cur_rollout_len = all_s.shape[1]
			print("Epoch {} :".format(n), flush=True)
			#dumped_rollout_nb = "?"
			print("Loaded {} rollouts for training while dumping {}.".format(n_rollouts, dumped_rollout_nb), flush=True)
			dt = time.time() - start_time
			start_time = time.time()
			if dt > 0:
				print("fps : {}".format(n_rollouts*cur_rollout_len/dt), flush=True)
			print("mean_rew : {}".format(np.sum(all_r * all_masks)/np.sum(all_masks)), flush=True)
			
			if USE_ADR:
				env.adr.save()
				
	elif mpi_role == 'worker':
		trainer = PPO(env, actor, Critic(env), init_log_std=init_log_std)
		
		msg = {	pnid+"weights" : warehouse.Entry(action="get", value=None),
				pnid+"adr" : warehouse.Entry(action="set", value={}),
				"proc_num" : warehouse.Entry(action="get", value=None)}
		data = warehouse.send(msg)
		
		while proc_num >= data["proc_num"].value and not warehouse.is_work_done:
			test_adr = USE_ADR and np.random.random() < adr_test_prob
			
			env.test_adr = test_adr
			
			trainer.set_weights (data[pnid+"weights"].value)
			
			if test_adr:
				# simulate rollout
				all_s, all_a, all_r, all_neglog, all_mask = trainer.get_rollout(env.adr_rollout_len)
				
				msg = {	pnid+"adr" : warehouse.Entry(action="update", value=env.adr.get_msg()),
						pnid+"weights" : warehouse.Entry(action="get", value=None),
						"proc_num" : warehouse.Entry(action="get", value=None)}
			else:
				# simulate rollout
				all_s, all_a, all_r, all_neglog, all_mask = trainer.get_rollout(rollout_len)
				
				# send rollout back to warehouse
				# and get network weights to update actor
				msg = {	pnid+"s" : warehouse.Entry(action="add", value=all_s),
						pnid+"a" : warehouse.Entry(action="add", value=all_a),
						pnid+"r" : warehouse.Entry(action="add", value=all_r),
						pnid+"neglog" : warehouse.Entry(action="add", value=all_neglog),
						pnid+"mask" : warehouse.Entry(action="add", value=all_mask),
						pnid+"weights" : warehouse.Entry(action="get", value=None),
						pnid+"adr" : warehouse.Entry(action="get", value=None), 
						"proc_num" : warehouse.Entry(action="get", value=None)}
				
			data = warehouse.send(msg)
			
			if USE_ADR:
				env.adr.update(data[pnid+"adr"].value)
Example #10
class Agent():
    """ Interacts with and learn from the environment """
    def __init__(self, state_size, action_size, random_seed, actor_layers,
                 critic_layers):
        """ Initialize an Agent object.

        Params
        ======
            state_size (int): size of the environment state
            action_size (int): size of the environment action
            random_seed (int): seed for the random number generators
            actor_layers (array[int]): array containing the size of each layer of the actor network
            critic_layers (array[int]): array containing the size of each layer of the critic network
        """

        self.state_size = state_size
        self.action_size = action_size
        self.random_seed = random_seed
        random.seed(random_seed)
        np.random.seed(random_seed)

        # Actor
        print(f'Agent running on {DEVICE}')
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.random_seed, *actor_layers).to(DEVICE)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.random_seed, *actor_layers).to(DEVICE)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic
        self.critic_local = Critic(self.state_size, self.action_size,
                                   self.random_seed, *critic_layers).to(DEVICE)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    self.random_seed,
                                    *critic_layers).to(DEVICE)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise
        self.noise = OrsnteinUhlenbeck(self.action_size, self.random_seed)

        # Replay Buffer
        self.memory = ReplayBuffer(self.action_size, BUFFER_SIZE, BATCH_SIZE,
                                   self.random_seed)

    def step(self, states, actions, rewards, next_states, dones, time_step):
        """ Save experience in replay memory, and use random sample from buffer to learn """
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn only if there are enough samples in memory
        if len(self.memory) > BATCH_SIZE and time_step % LEARN_STEPS == 0:
            for _ in range(N_UPDATES):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True, epsilon=1.0):
        """ Returns actions for given state as per current policy """
        state = torch.from_numpy(state).float().to(DEVICE)
        self.actor_local.eval()

        with torch.no_grad():
            actions = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:
            # actions += self.noise.sample()
            actions += np.random.normal(0, .3) * epsilon

        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """ Update policy and value parameters using given batch of experience tuples
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """

        states, actions, rewards, next_states, dones = experiences

        # Critic update
        actions_next = self.actor_target(next_states)
        q_targets_next = self.critic_target(next_states, actions_next)
        Q_targets = rewards + (gamma * q_targets_next * (1 - dones))

        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # Actor update
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update weights
        self.soft_update(self.actor_local, self.actor_target, TAU)
        self.soft_update(self.critic_local, self.critic_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """ Soft update model parameters
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """

        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1 - tau) * target_param.data)
Example #11
class DDPGAgent():
    def __init__(self, env, hp):

        self.env = env
        self.hp = hp
        self.critic = Critic(env.observation_space.shape[0],
                             env.action_space.shape[0], hp)

        self.target_critic = Critic(env.observation_space.shape[0],
                                    env.action_space.shape[0], hp)

        self.actor = Actor(env.observation_space.shape[0],
                           env.action_space.shape[0], env.action_space.high[0],
                           hp)

        self.target_actor = Actor(env.observation_space.shape[0],
                                  env.action_space.shape[0],
                                  env.action_space.high[0], hp)

        self.dataset = ReplayBuffer(self.hp['batch_size'],
                                    self.hp['max_buffer_size'])

        self.noise = OrnsteinUhlenbeckProcess(env.action_space.shape[0],
                                              sigma=self.hp['noise_sigma'])
        self.noise.reset_states()

    def take_action(self, state, greedy=False):

        state = Variable(torch.from_numpy(state)).float()
        action = self.actor.predict(state)
        if greedy:
            return action.detach().numpy()

        return action.detach().numpy() \
            + (self.noise.sample() * self.env.action_space.high[0])

    def collect(self, n_episodes, max_episodes):

        state = self.env.reset()
        reward_list = []

        for _ in range(n_episodes):
            reward = 0
            for step in range(max_episodes):
                action = self.take_action(state, greedy=True)
                s_next, r, done, _ = self.env.step(action)
                state = s_next
                reward += r
                if done:
                    break

            reward_list.append(reward)
            state = self.env.reset()

        return np.mean(reward_list)

    def buffer_update(self, sample):

        self.dataset.add_sample(sample)

    def _critic_update(self, batch):

        s = batch[0]
        a = batch[1]
        r = batch[2]
        s_next = batch[3]
        done = batch[4]
        target_actions = self.target_actor.predict(s_next)
        Q_val = self.target_critic.predict(s_next, target_actions)
        y_target = r + done * (self.hp['gamma'] * Q_val)
        #y_target2 = r + self.hp['gamma'] * Q_val
        #print(y_target!=y_target2,done)
        y_pred = self.critic.predict(s, a)
        self.critic.train(y_pred, y_target)

    def _actor_update(self, batch):

        s = batch[0]
        pred_a = self.actor.predict(s)
        loss = torch.mean(-self.critic.predict(s, pred_a))
        self.actor.train(loss)

    def update(self):

        if self.dataset.length < self.hp['batch_size']:
            return
        batch = self.dataset.get_batch()

        self._critic_update(batch)
        self._actor_update(batch)
        self._target_update(self.hp['tau'], self.target_critic, self.critic)
        self._target_update(self.hp['tau'], self.target_actor, self.actor)

    def _target_update(self, tau, target_network, network):
        for target_param, param in zip(target_network.parameters(),
                                       network.parameters()):
            target_param.data.copy_(tau * param.data + target_param.data *
                                    (1.0 - tau))

    def save_models(self, episode):

        torch.save(self.target_actor.state_dict(),
                   './trained_models/' + str(episode) + 'actor.pt')
        torch.save(self.target_critic.state_dict(),
                   './trained_models/' + str(episode) + 'critic.pt')
        print('Models Saved!')

    def load_models(self, path):

        self.actor.load_state_dict(torch.load(path + 'actor.pt'))
        self.critic.load_state_dict(torch.load(path + 'critic.pt'))
        self._target_update(1, self.target_actor, self.actor)
        self._target_update(1, self.target_critic, self.critic)
        print('Models Loaded!')
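Finally, a bare-bones training loop for this agent (a sketch; the episode and step budgets are arbitrary, env and hp are the same objects passed to DDPGAgent.__init__, and the sample tuple order mirrors how _critic_update unpacks its batch):

agent = DDPGAgent(env, hp)
for episode in range(500):                  # arbitrary episode budget
    state = env.reset()
    agent.noise.reset_states()
    for step in range(1000):                # arbitrary step limit per episode
        action = agent.take_action(state)   # actor output plus OU noise
        next_state, reward, done, _ = env.step(action)
        agent.buffer_update((state, action, reward, next_state, done))
        agent.update()                      # critic, actor and target-network updates
        state = next_state
        if done:
            break
    if episode % 50 == 0:
        agent.save_models(episode)          # saves to ./trained_models/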