Example #1
def main():
    env = gym.make(args.env_name)
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    running_state = ZFilter((num_inputs,), clip=5)

    print('state size:', num_inputs) 
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)
    discrim = Discriminator(num_inputs + num_actions, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate, 
                              weight_decay=args.l2_rate) 
    discrim_optim = optim.Adam(discrim.parameters(), lr=args.learning_rate)
    
    # load demonstrations
    expert_demo, _ = pickle.load(open('./expert_demo/expert_demo.p', "rb"))
    demonstrations = np.array(expert_demo)
    print("demonstrations.shape", demonstrations.shape)
    
    writer = SummaryWriter(args.logdir)

    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])
        discrim.load_state_dict(ckpt['discrim'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

    
    episodes = 0
    train_discrim_flag = True

    for iter in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []

        while steps < args.total_sample_size: 
            state = env.reset()
            score = 0

            state = running_state(state)
            
            for _ in range(10000): 
                if args.render:
                    env.render()

                steps += 1

                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                action = get_action(mu, std)[0]
                next_state, reward, done, _ = env.step(action)
                irl_reward = get_reward(discrim, state, action)

                if done:
                    mask = 0
                else:
                    mask = 1

                memory.append([state, action, irl_reward, mask])

                next_state = running_state(next_state)
                state = next_state

                score += reward

                if done:
                    break
            
            episodes += 1
            scores.append(score)
        
        score_avg = np.mean(scores)
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)

        actor.train(), critic.train(), discrim.train()
        if train_discrim_flag:
            expert_acc, learner_acc = train_discrim(discrim, memory, discrim_optim, demonstrations, args)
            print("Expert: %.2f%% | Learner: %.2f%%" % (expert_acc * 100, learner_acc * 100))
            if expert_acc > args.suspend_accu_exp and learner_acc > args.suspend_accu_gen:
                train_discrim_flag = False
        train_actor_critic(actor, critic, memory, actor_optim, critic_optim, args)

        if iter % 100 == 0:  # save a checkpoint every 100 iterations
            score_avg = int(score_avg)

            model_path = os.path.join(os.getcwd(),'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(model_path, 'ckpt_'+ str(score_avg)+'.pth.tar')

            save_checkpoint({
                'actor': actor.state_dict(),
                'critic': critic.state_dict(),
                'discrim': discrim.state_dict(),
                'z_filter_n':running_state.rs.n,
                'z_filter_m': running_state.rs.mean,
                'z_filter_s': running_state.rs.sum_square,
                'args': args,
                'score': score_avg
            }, filename=ckpt_path)
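
The ZFilter used above for state normalization comes from the repository's utility code and is not shown here. A minimal sketch of a running mean/std normalizer exposing the same fields read and written by the checkpoint code (rs.n, rs.mean, rs.sum_square) might look like this; it is an illustrative assumption, not the original implementation.

import numpy as np

class RunningStat:
    """Welford-style running estimate of mean and sum of squared deviations."""
    def __init__(self, shape):
        self.n = 0
        self.mean = np.zeros(shape)
        self.sum_square = np.zeros(shape)

    def push(self, x):
        x = np.asarray(x, dtype=np.float64)
        self.n += 1
        if self.n == 1:
            self.mean = x.copy()
        else:
            old_mean = self.mean.copy()
            self.mean = old_mean + (x - old_mean) / self.n
            self.sum_square = self.sum_square + (x - old_mean) * (x - self.mean)

    @property
    def std(self):
        var = self.sum_square / (self.n - 1) if self.n > 1 else np.square(self.mean)
        return np.sqrt(var)

class ZFilter:
    """y = clip((x - running_mean) / (running_std + eps), -clip, clip)."""
    def __init__(self, shape, clip=10.0):
        self.rs = RunningStat(shape)
        self.clip = clip

    def __call__(self, x, update=True):
        if update:
            self.rs.push(x)
        x = (x - self.rs.mean) / (self.rs.std + 1e-8)
        return np.clip(x, -self.clip, self.clip)
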
Example #2
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size).to(device)
        self.actor_target = Actor(state_size, action_size).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic1_local = Critic(state_size, action_size).to(device)
        self.critic1_target = Critic(state_size, action_size).to(device)
        self.critic1_optimizer = optim.Adam(self.critic1_local.parameters(),
                                            lr=LR_CRITIC,
                                            weight_decay=WEIGHT_DECAY)

        self.critic2_local = Critic(state_size, action_size).to(device)
        self.critic2_target = Critic(state_size, action_size).to(device)
        self.critic2_optimizer = optim.Adam(self.critic2_local.parameters(),
                                            lr=LR_CRITIC,
                                            weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size)

        # Replay memory
        self.memory = PER(BUFFER_SIZE)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory."""
        # Set reward as initial priority, see:
        #   https://jaromiru.com/2016/11/07/lets-make-a-dqn-double-learning-and-prioritized-experience-replay/
        self.memory.add((state, action, reward, next_state, done), reward)

    def act(self, state):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        action += self.noise.sample()
        return np.clip(action, -1., 1.)

    def reset(self):
        self.noise.reset()

    def mse(self, expected, targets, is_weights):
        """Custom loss function that takes into account the importance-sampling weights."""
        td_error = expected - targets
        weighted_squared_error = is_weights * td_error * td_error
        return torch.sum(weighted_squared_error) / torch.numel(
            weighted_squared_error)

    def learn(self):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        """
        for i in range(1, LEARN_BATCH + 1):
            idxs, experiences, is_weights = self.memory.sample(BATCH_SIZE)

            states = torch.from_numpy(
                np.vstack([e[0] for e in experiences
                           if e is not None])).float().to(device)
            actions = torch.from_numpy(
                np.vstack([e[1] for e in experiences
                           if e is not None])).float().to(device)
            rewards = torch.from_numpy(
                np.vstack([e[2] for e in experiences
                           if e is not None])).float().to(device)
            next_states = torch.from_numpy(
                np.vstack([e[3] for e in experiences
                           if e is not None])).float().to(device)
            dones = torch.from_numpy(
                np.vstack([e[4] for e in experiences if e is not None
                           ]).astype(np.uint8)).float().to(device)

            is_weights = torch.from_numpy(is_weights).float().to(device)

            # ---------------------------- update critic ---------------------------- #
            # Target Policy Smoothing Regularization: add a small amount of clipped random noise to the selected action
            if POLICY_NOISE > 0.0:
                noise = torch.empty_like(actions).data.normal_(
                    0, POLICY_NOISE).to(device)
                noise = noise.clamp(-POLICY_NOISE_CLIP, POLICY_NOISE_CLIP)
                # Get predicted next-state actions and Q values from target models
                actions_next = (self.actor_target(next_states) + noise).clamp(
                    -1., 1.)
            else:
                # Get predicted next-state actions and Q values from target models
                actions_next = self.actor_target(next_states)

            # Error Mitigation
            Q_targets_next = torch.min(\
                self.critic1_target(next_states, actions_next), \
                self.critic2_target(next_states, actions_next)).detach()

            # Compute Q targets for current states (y_i)
            Q_targets = rewards + (GAMMA * Q_targets_next * (1 - dones))

            # Compute critic1 loss
            Q_expected = self.critic1_local(states, actions)
            errors1 = np.abs((Q_expected - Q_targets).detach().cpu().numpy())
            critic1_loss = self.mse(Q_expected, Q_targets, is_weights)
            # Minimize the loss
            self.critic1_optimizer.zero_grad()
            critic1_loss.backward()
            torch.nn.utils.clip_grad_norm_(self.critic1_local.parameters(), 1)
            self.critic1_optimizer.step()

            # Update priorities in the replay buffer
            self.memory.batch_update(idxs, errors1)

            # Compute critic2 loss
            Q_expected = self.critic2_local(states, actions)
            critic2_loss = self.mse(Q_expected, Q_targets, is_weights)
            # Minimize the loss
            self.critic2_optimizer.zero_grad()
            critic2_loss.backward()
            torch.nn.utils.clip_grad_norm_(self.critic2_local.parameters(), 1)
            self.critic2_optimizer.step()

            # Delayed Policy Updates
            if i % UPDATE_ACTOR_EVERY == 0:
                # ---------------------------- update actor ---------------------------- #
                # Compute actor loss
                actions_pred = self.actor_local(states)
                actor_loss = -self.critic1_local(states, actions_pred).mean()
                # Minimize the loss
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                # ----------------------- update target networks ----------------------- #
                self.soft_update(self.critic1_local, self.critic1_target, TAU)
                self.soft_update(self.critic2_local, self.critic2_target, TAU)
                self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def save_weights(self):
        torch.save(self.actor_local.state_dict(), actor_weights_file)
        torch.save(self.critic1_local.state_dict(), critic1_weights_file)
        torch.save(self.critic2_local.state_dict(), critic2_weights_file)

    def load_weights(self):
        self.actor_local.load_state_dict(torch.load(actor_weights_file))
        self.critic1_local.load_state_dict(torch.load(critic1_weights_file))
        self.critic2_local.load_state_dict(torch.load(critic2_weights_file))
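
OUNoise and PER (a prioritized replay buffer) are referenced above but defined elsewhere in the original project. A minimal Ornstein-Uhlenbeck noise process compatible with the calls used here (sample() and reset()) could look like the sketch below; the parameter values are common defaults, not necessarily the author's.

import copy
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process: dx = theta * (mu - x) + sigma * N(0, 1)."""
    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2, seed=None):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.rng = np.random.default_rng(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new noise sample."""
        dx = self.theta * (self.mu - self.state) \
             + self.sigma * self.rng.standard_normal(len(self.state))
        self.state = self.state + dx
        return self.state
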
Example #3
class Agent():
    "Single agent no learning algorithm"

    def __init__(self,
                 state_size,
                 action_size,
                 random_seed,
                 lr_actor=1e-4,
                 lr_critic=1e-3,
                 weight_decay=0):
        """Initialize an Agent object

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            lr_actor (float) : learning rate actor network
            lr_critic (float) : learning rate critic network
            weight_decay (float) : weight decay regularizer
        """
        self.state_size = state_size
        self.action_size = action_size
        random.seed(random_seed)
        self.seed = random_seed
        self.noise = OUNoise(action_size, random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=weight_decay)

    def act(self, state, add_noise=True):
        "Returns actions for given state as per current policy"
        if not isinstance(state, torch.Tensor):
            state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def load(self, filename, map_location=None):
        "Load weights for actor and critic"
        weights = torch.load(filename, map_location=map_location)
        self.actor_local.load_state_dict(weights['actor'])
        if 'critic' in weights:
            self.critic_local.load_state_dict(weights['critic'])

    def reset(self):
        self.noise.reset()

    def save(self, filename='checkpoint.pth'):
        "Serialize actor and critic weights"
        checkpoint = {
            'actor': self.actor_local.state_dict(),
            'critic': self.critic_local.state_dict()
        }
        torch.save(checkpoint, filename)
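
A possible way to drive the agent above in a short evaluation loop, using the classic Gym API that the other examples here also use. The environment id and checkpoint path are placeholders, Actor, Critic, OUNoise and device are assumed to come from the same module as the Agent class, and the environment's action range is assumed to be [-1, 1].

import gym

env = gym.make('Pendulum-v1')  # placeholder continuous-control environment
agent = Agent(env.observation_space.shape[0], env.action_space.shape[0], random_seed=0)
agent.load('checkpoint.pth')   # checkpoint previously written by agent.save()

state = env.reset()
done = False
episode_return = 0.0
while not done:
    action = agent.act(state, add_noise=False)  # deterministic action for evaluation
    state, reward, done, _ = env.step(action)
    episode_return += reward
print('episode return:', episode_return)
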
Example #4
class DDPG(object):
    def __init__(self, nb_status, nb_actions, args):
        self.num_actor = 3

        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.discrete = args.discrete

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'use_bn': args.bn
        }
        self.actors = [Actor(self.nb_status, self.nb_actions) for _ in range(self.num_actor)]
        self.actor_targets = [Actor(self.nb_status, self.nb_actions) for _ in
                              range(self.num_actor)]
        self.actor_optims = [Adam(self.actors[i].parameters(), lr=args.prate) for i in range(self.num_actor)]

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        for i in range(self.num_actor):
            hard_update(self.actor_targets[i], self.actors[i])  # make sure each target starts with the same weights
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = rpm(args.rmsize)  # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        # 
        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.use_cuda = args.cuda
        # 
        if self.use_cuda: self.cuda()

    def update_policy(self, train_actor=True):
        # Sample batch
        state_batch, action_batch, reward_batch, \
        next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # Prepare for the target q batch
        next_q_values = 0
        for i in range(self.num_actor):
            next_q_values = next_q_values + self.critic_target([
                to_tensor(next_state_batch, volatile=True),
                self.actor_targets[i](to_tensor(next_state_batch, volatile=True)),
            ])
        # print('batch of picture is ok')
        next_q_values = next_q_values / self.num_actor
        next_q_values.volatile = False

        target_q_batch = to_tensor(reward_batch) + \
                         self.discount * to_tensor((1 - terminal_batch.astype(np.float64))) * next_q_values

        # Critic update
        self.critic.zero_grad()
        q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])

        # print(reward_batch, next_q_values*self.discount, target_q_batch, terminal_batch.astype(np.float))
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        sum_policy_loss = 0
        for i in range(self.num_actor):
            self.actors[i].zero_grad()

            policy_loss = -self.critic([
                to_tensor(state_batch),
                self.actors[i](to_tensor(state_batch))
            ])

            policy_loss = policy_loss.mean()
            policy_loss.backward()
            if train_actor:
                self.actor_optims[i].step()
            sum_policy_loss += policy_loss

            # Target update
            soft_update(self.actor_targets[i], self.actors[i], self.tau)

        soft_update(self.critic_target, self.critic, self.tau)

        return -sum_policy_loss / self.num_actor, value_loss

    def cuda(self):
        for i in range(self.num_actor):
            self.actors[i].cuda()
            self.actor_targets[i].cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        if self.discrete:
            return action.argmax()
        else:
            return action

    def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0):
        actions = []
        status = []
        tot_score = []
        for i in range(self.num_actor):
            action = to_numpy(self.actors[i](to_tensor(np.array([s_t]), volatile=True))).squeeze(0)
            noise_level = noise_level * max(self.epsilon, 0)
            action = action + self.random_process.sample() * noise_level
            status.append(s_t)
            actions.append(action)
            tot_score.append(0.)

        scores = self.critic([to_tensor(np.array(status), volatile=True), to_tensor(np.array(actions), volatile=True)])
        for j in range(self.num_actor):
            tot_score[j] += scores.data[j][0]
        best = np.array(tot_score).argmax()

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = actions[best]
        return actions[best]

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=0):
        if output is None: return
        for i in range(self.num_actor):
            actor = self.actors[i]
            actor_target = self.actor_targets[i]
            actor.load_state_dict(
                torch.load('{}/actor{}_{}.pkl'.format(output, num, i))
            )
            actor_target.load_state_dict(
                torch.load('{}/actor{}_{}.pkl'.format(output, num, i))
            )
        self.critic.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )
        self.critic_target.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )

    def save_model(self, output, num):
        if self.use_cuda:
            for i in range(self.num_actor):
                self.actors[i].cpu()
            self.critic.cpu()
        for i in range(self.num_actor):
            torch.save(
                self.actors[i].state_dict(),
                '{}/actor{}_{}.pkl'.format(output, num, i)
            )
        torch.save(
            self.critic.state_dict(),
            '{}/critic{}.pkl'.format(output, num)
        )
        if self.use_cuda:
            for i in range(self.num_actor):
                self.actors[i].cuda()
            self.critic.cuda()
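
hard_update and soft_update are called above (and in the next example) but defined elsewhere in the original repositories. Minimal sketches that match the argument order used here (target network first) would be:

def hard_update(target, source):
    """Copy the source network's parameters into the target network."""
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(source_param.data)


def soft_update(target, source, tau):
    """Polyak averaging: theta_target = tau * theta_source + (1 - tau) * theta_target."""
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * source_param.data + (1.0 - tau) * target_param.data)
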
Example #5
class DDPG(object):
    def __init__(self, nb_states, nb_actions, args):

        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions

        # Create Actor and Critic Network
        net_cfg = {
            "hidden1": args.hidden1,
            "hidden2": args.hidden2,
            "init_w": args.init_w,
        }
        self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(
            self.actor_target, self.actor
        )  # make sure the targets start with the same weights
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = SequentialMemory(
            limit=args.rmsize, window_length=args.window_length
        )
        self.random_process = OrnsteinUhlenbeckProcess(
            size=nb_actions, theta=args.ou_theta, mu=args.ou_mu, sigma=args.ou_sigma
        )

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        #
        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True

        #
        if USE_CUDA:
            self.cuda()

    def update_policy(self):
        # Sample batch
        (
            state_batch,
            action_batch,
            reward_batch,
            next_state_batch,
            terminal_batch,
        ) = self.memory.sample_and_split(self.batch_size)

        # Prepare for the target q batch
        next_q_values = self.critic_target(
            [
                to_tensor(next_state_batch, volatile=True),
                self.actor_target(to_tensor(next_state_batch, volatile=True)),
            ]
        )
        # next_q_values.volatile = False

        target_q_batch = (
            to_tensor(reward_batch)
            + self.discount * to_tensor(terminal_batch.astype(np.float64)) * next_q_values
        )

        # Critic update
        self.critic.zero_grad()

        q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])

        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update
        self.actor.zero_grad()

        policy_loss = -self.critic(
            [to_tensor(state_batch), self.actor(to_tensor(state_batch))]
        )

        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1.0, 1.0, self.nb_actions)
        self.a_t = action
        return action

    def select_action(self, s_t, decay_epsilon=True):
        action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
        action += self.is_training * max(self.epsilon, 0) * self.random_process.sample()
        action = np.clip(action, -1.0, 1.0)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_states()

    def load_weights(self, output):
        if output is None:
            return

        self.actor.load_state_dict(torch.load("{}/actor.pkl".format(output)))

        self.critic.load_state_dict(torch.load("{}/critic.pkl".format(output)))

    def save_model(self, output):
        torch.save(self.actor.state_dict(), "{}/actor.pkl".format(output))
        torch.save(self.critic.state_dict(), "{}/critic.pkl".format(output))

    def seed(self, s):
        torch.manual_seed(s)
        if USE_CUDA:
            torch.cuda.manual_seed(s)
Example #6
class ddpg_Agent():
    """Interacts with and learns from the environment."""
    
    def __init__(self, env, config):
        """Initialize an Agent object.
        
        Params
        ======
            env : environment to be handled
            config : configuration given a variety of parameters
        """
        self.env = env
        self.config = config

        # set parameter for ML
        self.set_parameters(config)
        # Q-Network
        self.create_networks()
        # Noise process
        self.noise = OUNoise(self.action_size, self.seed)
        # Replay memory
        self.memory = ReplayBuffer(self.action_size, self.buffer_size, self.batch_size, self.seed)
    
    def set_parameters(self, config):
        # Base agent parameters
        self.gamma = config['gamma']                    # discount factor 
        self.tau = config['tau']
        self.max_episodes = config['max_episodes']      # max number of episodes to train
        self.env_file_name = config['env_file_name']    # name and path for env app
        self.brain_name = config['brain_name']          # name for env brain used in step
        self.num_agents = config['num_agents']
        self.state_size = config['state_size']
        self.action_size = config['action_size']
        self.hidden_size = config['hidden_size']
        self.buffer_size = config['buffer_size']
        self.batch_size = config['batch_size']
        self.dropout = config['dropout']
        self.critic_learning_rate = config['critic_learning_rate']
        self.actor_learning_rate = config['actor_learning_rate']
        self.seed = config['seed']
        self.noise_scale = 1
        self.noise_sigma = 0.1
        # Some debug flags
        self.DoDebugEpisodeLists = False        
        
    def create_networks(self):
        # Actor Network (local & Target Network)
        self.actor_local = Actor(self.state_size, self.hidden_size, self.action_size, self.seed).to(device)
        self.actor_target = Actor(self.state_size, self.hidden_size, self.action_size, self.seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.actor_learning_rate)

        # Critic Network (local & Target Network)
        self.critic_local = Critic(self.state_size, self.hidden_size, self.action_size, self.seed, self.dropout).to(device)
        self.critic_target = Critic(self.state_size, self.hidden_size, self.action_size, self.seed, self.dropout).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.critic_learning_rate)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        # print('step : Next States : ',next_state.shape)
        self.memory.add(state, action, reward, next_state, done)
        # print('New step added to memory, length : ',len(self.memory))

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma)
            
    def update_noise_scale(self, cur_reward, scale_min=0.2, scale_noise=False):
        """If scale_noise is True, decrease self.noise_scale as rewards improve.
           Hand-tuned schedule: as rewards go up, noise_scale goes down from 1 to scale_min."""

        if scale_noise:
            rewlow = 2    # below rewlow, noise_scale stays at 1; between rewlow and rewhigh it
                          # decreases linearly to scale_min + 0.5*(1 - scale_min)
            rewhigh = 10  # above rewhigh it keeps decreasing linearly towards scale_min until the
                          # reward reaches 30; beyond 30 it stays at scale_min
            if cur_reward > rewlow:
                if cur_reward < rewhigh:
                    self.noise_scale = (1 - scale_min)*(0.5*(rewhigh - cur_reward)/(rewhigh - rewlow) + 0.5) + scale_min
                else:
                    # clamp the factor at 0 so the scale never drops below scale_min
                    self.noise_scale = (1 - scale_min)*np.maximum(0.5*(30 - cur_reward)/(30 - rewhigh), 0) + scale_min

            print('Updated noise scale to : ', self.noise_scale)

        return
        

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = ten(state)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise_scale * self.noise.sample()
            # ToDo check if tanh works better
        return np.clip(action, -1, 1)

    def train(self):
        if False:  # set to True to resume training from a previously saved checkpoint
            filename = 'trained_reacher_a_e100.pth'
            self.load_agent(filename)
        all_rewards = []
        reward_window = deque(maxlen=100)
        print('Running on device : ',device)
        for i_episode in range(self.max_episodes): 
            tic = time.time()
            # Reset the environment
            env_info = self.env.reset(train_mode=True)[self.brain_name]
            state = env_info.vector_observations
            total_reward = np.zeros(self.num_agents)
            t = 0
            done = np.zeros(self.num_agents, dtype = bool)

            # loop over episode time steps
            while all(done==False): #  t < self.tmax:
                # act and collect data
                action = self.act(state)
                env_info = self.env.step(action)[self.brain_name]
                next_state = env_info.vector_observations
                reward = np.asarray(env_info.rewards)
                done = np.asarray(env_info.local_done)
                # np.set_printoptions(formatter={'float': '{: 0.3f}'.format})
                # print('Episode {} step {} taken action {} reward {} and done is {}'.format(i_episode,t,action,reward,done))
                # increment stuff
                t += 1
                total_reward += reward
                # Proceed agent step
                self.step(state, action, reward, next_state, done)
                # prepare for next round
                state = next_state
            # while not done
            # keep track of rewards:
            all_rewards.append(np.mean(total_reward))
            reward_window.append(np.mean(total_reward))
            
            # Output Episode info : 
            toc = time.time()
            if (i_episode == 100):
                self.stable_update()
            self.update_noise_scale(np.mean(reward_window))
            if not (i_episode % 25 == 0):
                print('Episode {} || Total Reward : {:6.3f} || average reward : {:6.3f} || Used {:5.3f} seconds, mem : {}'.format(i_episode,np.mean(total_reward),np.mean(reward_window),toc-tic,len(self.memory)))
            else:
                print(Back.RED + 'Episode {} || Total Reward : {:6.3f} || average reward : {:6.3f}'.format(i_episode,np.mean(total_reward),np.mean(reward_window)))
                print(Style.RESET_ALL)
                
            if (i_episode % 50 == 0):
                self.save_agent(i_episode)
        # for i_episode
            
        return all_rewards

    def reset(self):
        self.noise.reset()
        
    def stable_update(self):
        """ Update Hyperparameters which proved more stable """
        self.buffer_size = 400000
        self.memory.enlarge(self.buffer_size)
        self.noise_sigma = 0.05
        self.noise.sigma = 0.05

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            self.gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        # print('learn : Next States : ',next_states.shape)
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # print('learn : Actions : ',actions_next.shape)
        # print('learn : Q_target_next : ',Q_targets_next.shape)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
            
    def save_agent(self,i_episode):
        filename = 'trained_reacher_e'+str(i_episode)+'.pth'
        torch.save({
            'critic_local': self.critic_local.state_dict(),
            'critic_target': self.critic_target.state_dict(),
            'actor_local': self.actor_local.state_dict(),
            'actor_target': self.actor_target.state_dict(),
            }, filename)
        
        print('Saved Networks in ',filename)
        return
        
    def load_agent(self,filename):
        savedata = torch.load(filename)
        self.critic_local.load_state_dict(savedata['critic_local'])
        self.critic_target.load_state_dict(savedata['critic_target'])
        self.actor_local.load_state_dict(savedata['actor_local'])
        self.actor_target.load_state_dict(savedata['actor_target'])
        return
Example #7
def main():
    env = gym.make(args.env_name)
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    running_state = ZFilter((num_inputs, ), clip=5)
    # ZFilter keeps a running mean/std estimate of observations and returns a clipped, normalized state

    print('state size:', num_inputs)
    print('action size:', num_actions)

    #load agent stuff
    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)
    discrim = Discriminator(num_inputs + num_actions, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(),
                              lr=args.learning_rate,
                              weight_decay=args.l2_rate)
    discrim_optim = optim.Adam(discrim.parameters(), lr=args.learning_rate)

    # load demonstrations
    expert_demo, _ = pickle.load(open('./expert_demo/expert_demo.p', "rb"))
    demonstrations = np.array(expert_demo)
    print("demonstrations.shape", demonstrations.shape)

    writer = SummaryWriter(args.logdir)

    #if you aren't starting from scratch, load in this
    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model',
                                       str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        # initialize everything
        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])
        discrim.load_state_dict(ckpt['discrim'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

    # if no old model no worries, start training.
    episodes = 0
    train_discrim_flag = True

    for iter in range(args.max_iter_num):
        # for i total trajectories
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []

        while steps < args.total_sample_size:
            # sample trajectories  (batch size)
            state = env.reset()
            score = 0

            state = running_state(
                state)  # normalize the state with the running ZFilter

            for _ in range(10000):
                #run through environment
                if args.render:
                    env.render()

                steps += 1

                mu, std = actor(torch.Tensor(state).unsqueeze(
                    0))  # pass the state through the actor network
                action = get_action(mu, std)[0]  # sample a stochastic action from N(mu, std)
                next_state, reward, done, _ = env.step(action)  # take an environment step
                irl_reward = get_reward(
                    discrim, state, action
                )  # surrogate reward for this (state, action), inferred from the discriminator

                if done:
                    mask = 0  # mask is 0 at episode termination, 1 otherwise
                else:
                    mask = 1

                memory.append([state, action, irl_reward, mask])

                next_state = running_state(
                    next_state)  # normalize the next state
                state = next_state  # and move on to it

                score += reward  # accumulate the environment reward

                if done:
                    break
                #actual sampling done here

            episodes += 1
            scores.append(score)

        score_avg = np.mean(scores)  # average return over this iteration's episodes
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes,
                                                       score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)  # log to TensorBoard

        actor.train(), critic.train(), discrim.train()  # switch back to training mode
        if train_discrim_flag:  # keep optimizing the discriminator until it is accurate enough
            # for training the discriminator
            expert_acc, learner_acc = train_discrim(
                discrim, memory, discrim_optim, demonstrations,
                args)  # see comments in train_model.
            print("Expert: %.2f%% | Learner: %.2f%%" %
                  (expert_acc * 100, learner_acc * 100))
            if expert_acc > args.suspend_accu_exp and learner_acc > args.suspend_accu_gen:
                train_discrim_flag = False  # discriminator is accurate enough; keep training only the policy
        #for training actor critic
        train_actor_critic(actor, critic, memory, actor_optim, critic_optim,
                           args)  # no output, see comments in train_model

        if iter % 100 == 0:  # save a checkpoint every 100 iterations
            score_avg = int(score_avg)

            model_path = os.path.join(os.getcwd(), 'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(model_path,
                                     'ckpt_' + str(score_avg) + '.pth.tar')

            save_checkpoint(
                {
                    'actor': actor.state_dict(),
                    'critic': critic.state_dict(),
                    'discrim': discrim.state_dict(),
                    'z_filter_n': running_state.rs.n,
                    'z_filter_m': running_state.rs.mean,
                    'z_filter_s': running_state.rs.sum_square,
                    'args': args,
                    'score': score_avg
                },
                filename=ckpt_path)
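
get_action and get_reward are imported from the repository's helper modules and are not shown here. The sketches below are consistent with how they are called above and with one common GAIL reward convention; treat them as assumptions rather than the original code.

import math
import torch

def get_action(mu, std):
    """Sample continuous actions from a diagonal Gaussian policy."""
    action = torch.normal(mu, std)
    return action.data.numpy()

def get_reward(discrim, state, action):
    """GAIL surrogate reward, here -log D(s, a), where D is the discriminator's
    probability that the (state, action) pair was generated by the learner."""
    state_action = torch.cat([torch.Tensor(state), torch.Tensor(action)])
    with torch.no_grad():
        prob = discrim(state_action)
    return -math.log(prob.item() + 1e-8)
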
Example #8
class Agent():
    """Interacts with and learns from the environment."""
    
    def __init__(self, state_size, action_size, random_seed, buffer_size, batch_size, gamma, tau, lr_actor, lr_critic, weight_decay, device, load_progress=False):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        random.seed(random_seed)
        self.seed = random_seed
		
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.device = device

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic, weight_decay=weight_decay)

        if load_progress:
            self.actor_local.load_state_dict(torch.load('./checkpoint_actor.pth'))
            self.actor_target.load_state_dict(torch.load('./checkpoint_actor.pth'))
            self.critic_local.load_state_dict(torch.load('./checkpoint_critic.pth'))
            self.critic_target.load_state_dict(torch.load('./checkpoint_critic.pth'))

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(device, action_size, buffer_size, batch_size, random_seed)
    
    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
		
        for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
#        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)		
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
#        torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1)		
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)                     

        self.actor_loss = actor_loss.data
        self.critic_loss = critic_loss.data
		
    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
			
    def save_progress(self):
        torch.save(self.actor_local.state_dict(), './checkpoint_actor.pth')      
        torch.save(self.critic_local.state_dict(), './checkpoint_critic.pth')
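
ReplayBuffer is not defined in this example. A minimal uniform replay buffer matching the constructor arguments used above and the (states, actions, rewards, next_states, dones) tensors unpacked in learn() might look like this sketch:

import random
from collections import deque, namedtuple

import numpy as np
import torch

class ReplayBuffer:
    """Fixed-size buffer that stores experience tuples and samples random minibatches."""

    def __init__(self, device, action_size, buffer_size, batch_size, seed):
        self.device = device
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience",
                                     ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(self.device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float().to(self.device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(self.device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(self.device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(self.device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)
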
Example #9
class TD3():
    def __init__(self):
        self.actor = Actor(n_state, n_action, max_action).to(device)
        self.target_actor = Actor(n_state, n_action, max_action).to(device)
        self.target_actor.load_state_dict(self.actor.state_dict())

        self.critic1 = Critic(n_state, n_action).to(device)
        self.target_critic1 = Critic(n_state, n_action).to(device)
        self.target_critic1.load_state_dict(self.critic1.state_dict())

        self.critic2 = Critic(n_state, n_action).to(device)
        self.target_critic2 = Critic(n_state, n_action).to(device)
        self.target_critic2.load_state_dict(self.critic2.state_dict())

        self.memory = replay_memory(memory_size)
        self.Aoptimizer = th.optim.Adam(self.actor.parameters(), lr=lr)
        self.C1optimizer = th.optim.Adam(self.critic1.parameters(), lr=lr)
        self.C2optimizer = th.optim.Adam(self.critic2.parameters(), lr=lr)

    def actor_learn(self, batch):
        b_s = th.FloatTensor(batch[:, 0].tolist()).to(device)
        action = self.actor(b_s)
        loss = -(self.critic1(b_s, action).mean())
        self.Aoptimizer.zero_grad()
        loss.backward()
        self.Aoptimizer.step()

    def critic_learn(self, batch, policy_noise):
        b_s = th.FloatTensor(batch[:, 0].tolist()).to(device)
        b_r = th.FloatTensor(batch[:, 1].tolist()).to(device)
        b_a = th.FloatTensor(batch[:, 2].tolist()).to(device)
        b_s_ = th.FloatTensor(batch[:, 3].tolist()).to(device)
        b_d = th.FloatTensor(batch[:, 4].tolist()).to(device)

        dist = th.distributions.normal.Normal(th.FloatTensor([0]),
                                              th.Tensor([policy_noise]))
        next_action = self.target_actor(b_s_) + th.clamp(dist.sample(), -c,
                                                         c).to(device)
        next_action = th.clamp(next_action, env.action_space.low[0],
                               env.action_space.high[0])
        #print(th.cat((self.target_critic1(b_s_,next_action),self.target_critic2(b_s_,next_action)),dim=1))
        target_q = th.min(self.target_critic1(b_s_, next_action),
                          self.target_critic2(b_s_, next_action))
        #print(target_q)
        for i in range(b_d.shape[0]):
            if b_d[i]:
                target_q[i] = b_r[i]
            else:
                target_q[i] = b_r[i] + gamma * target_q[i]
        #target_q=b_r+gamma*target_q
        eval_q1 = self.critic1(b_s, b_a)
        td_error = eval_q1 - target_q.detach()
        loss = (td_error**2).mean()
        self.C1optimizer.zero_grad()
        loss.backward()
        self.C1optimizer.step()

        eval_q2 = self.critic2(b_s, b_a)
        td_error = eval_q2 - target_q.detach()
        loss = (td_error**2).mean()
        self.C2optimizer.zero_grad()
        loss.backward()
        self.C2optimizer.step()

    def soft_update(self):
        for param, target_param in zip(self.actor.parameters(),
                                       self.target_actor.parameters()):
            target_param.data.copy_(tau * param.data +
                                    (1 - tau) * target_param.data)
        for param, target_param in zip(self.critic1.parameters(),
                                       self.target_critic1.parameters()):
            target_param.data.copy_(tau * param.data +
                                    (1 - tau) * target_param.data)
        for param, target_param in zip(self.critic2.parameters(),
                                       self.target_critic2.parameters()):
            target_param.data.copy_(tau * param.data +
                                    (1 - tau) * target_param.data)
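
replay_memory is not defined in this example. Given the column indexing used in critic_learn (state, reward, action, next_state, done), one compatible sketch, with a hypothetical add() method for storing transitions, is:

import random
from collections import deque

import numpy as np

class replay_memory:
    """Uniform replay buffer whose samples come back as a (batch_size, 5) object array."""

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def add(self, state, reward, action, next_state, done):
        # store in the column order critic_learn expects: s, r, a, s', done
        self.buffer.append((state, reward, action, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        out = np.empty((len(batch), 5), dtype=object)
        for i, (s, r, a, s_, d) in enumerate(batch):
            out[i, 0], out[i, 1], out[i, 2], out[i, 3], out[i, 4] = s, r, a, s_, d
        return out

    def __len__(self):
        return len(self.buffer)
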
Example #10
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, id, actor_state_size, actor_action_size,
                 critic_state_size, critic_action_size, device):
        """Initialize an Agent object.
        
        Params
        ======
            id (string): agent's id
            actor_state_size (int): dimension of actor's each state
            actor_action_size (int): dimension of actor's each action
            critic_state_size (int): dimension of critic's each state
            critic_action_size (int): dimension of critic's each action
        """
        self.id = id
        self.actor_state_size = actor_state_size
        self.actor_action_size = actor_action_size
        self.critic_state_size = critic_state_size
        self.critic_action_size = critic_action_size
        self.device = device

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(actor_state_size,
                                 actor_action_size).to(self.device)
        self.actor_target = Actor(actor_state_size,
                                  actor_action_size).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(critic_state_size,
                                   critic_action_size).to(self.device)
        self.critic_target = Critic(critic_state_size,
                                    critic_action_size).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(actor_action_size)

    def act(self, state, noise=0.0, clip=True):
        """Returns actions for given state as per current policy."""
        state = torch.unsqueeze(torch.from_numpy(state).float(),
                                0).detach().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = torch.squeeze(self.actor_local(state).cpu()).data.numpy()
        self.actor_local.train()
        if noise != 0.0:
            action += noise * self.noise.sample()
            #action += noise*np.random.randn(self.actor_action_size)
        if clip:
            return np.clip(action, -1, 1)
        return action

    def reset(self):
        self.noise.reset()

    def learn(self, state, state_full, action, action_full, next_state,
              next_state_full, action_pred_full, action_next_full, reward,
              done):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        """

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        with torch.no_grad():
            Q_targets_next = self.critic_target(next_state_full,
                                                action_next_full)
        # Compute Q targets for current states (y_i)
        Q_targets = reward + (GAMMA * Q_targets_next * (1 - done))
        # Compute critic loss
        Q_expected = self.critic_local(state_full, action_full)
        critic_loss = F.mse_loss(Q_expected, Q_targets.detach())
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward(retain_graph=True)
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actor_loss = -self.critic_local(state_full, action_pred_full).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)

        return actor_loss, critic_loss

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(TAU * local_param.data +
                                    (1.0 - TAU) * target_param.data)

    def save_weights(self, episode=None):
        if episode is None:
            prefix = self.id
        else:
            prefix = self.id + '-' + str(episode)
        torch.save(
            self.actor_local.state_dict(),
            weights_file_folder + '/' + prefix + '-' + actor_weights_file)
        torch.save(
            self.critic_local.state_dict(),
            weights_file_folder + '/' + prefix + '-' + critic_weights_file)

    def load_weights(self, episode=None):
        if episode is None:
            prefix = self.id
        else:
            prefix = self.id + '-' + str(episode)
        self.actor_local.load_state_dict(
            torch.load(weights_file_folder + '/' + prefix + '-' +
                       actor_weights_file))
        self.critic_local.load_state_dict(
            torch.load(weights_file_folder + '/' + prefix + '-' +
                       critic_weights_file))
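
# --- Usage sketch (not part of the original example) -----------------------
# A minimal illustration of how the learn() signature above is typically fed
# by a MADDPG-style wrapper: per-agent batches are concatenated along the
# feature dimension to form the "full" tensors the centralised critic
# expects. The wrapper, the variable names (`agents`, `states`, ...) and the
# batch layout are assumptions made only for this sketch.
import torch

def maddpg_learn_step(agents, states, actions, rewards, dones, next_states):
    # states / actions / next_states: lists with one (batch, dim) tensor per agent
    state_full = torch.cat(states, dim=1)
    action_full = torch.cat(actions, dim=1)
    next_state_full = torch.cat(next_states, dim=1)
    # next actions come from every agent's *target* actor
    action_next_full = torch.cat(
        [a.actor_target(ns) for a, ns in zip(agents, next_states)], dim=1)
    for i, agent in enumerate(agents):
        # only agent i's action is re-predicted with its *local* actor
        preds = [a.actor_local(s) if j == i else actions[j].detach()
                 for j, (a, s) in enumerate(zip(agents, states))]
        action_pred_full = torch.cat(preds, dim=1)
        agent.learn(states[i], state_full, actions[i], action_full,
                    next_states[i], next_state_full, action_pred_full,
                    action_next_full, rewards[i], dones[i])
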
Example #11
0
class TD3:
    def __init__(self, config: Config):
        self.config = config
        self.is_training = True
        # self.buffer = deque(maxlen=self.config.max_buff)
        self.buffer = ReplayBuffer(self.config.max_buff)

        self.actor = Actor(self.config.state_dim, self.config.action_dim,
                           self.config.max_action)
        self.actor_target = Actor(self.config.state_dim,
                                  self.config.action_dim,
                                  self.config.max_action)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = Adam(self.actor.parameters(),
                                    lr=self.config.learning_rate)

        self.critic_1 = Critic(self.config.state_dim, self.config.action_dim)
        self.critic_1_target = Critic(self.config.state_dim,
                                      self.config.action_dim)
        self.critic_1_target.load_state_dict(self.critic_1.state_dict())
        self.critic_1_optimizer = Adam(self.critic_1.parameters(),
                                       lr=self.config.learning_rate)

        self.critic_2 = Critic(self.config.state_dim, self.config.action_dim)
        self.critic_2_target = Critic(self.config.state_dim,
                                      self.config.action_dim)
        self.critic_2_target.load_state_dict(self.critic_2.state_dict())
        self.critic_2_optimizer = Adam(self.critic_2.parameters(),
                                       lr=self.config.learning_rate)

        self.MseLoss = nn.MSELoss()

        if self.config.use_cuda:
            self.cuda()

    def act(self, state):
        state = torch.FloatTensor(state.reshape(1, -1)).to(device)
        action = self.actor(state)
        return action.cpu().data.numpy().flatten()  #.detach()

    def learning(self, fr, t):

        for i in range(t):
            state, action_, reward, next_state, done = self.buffer.sample(
                self.config.batch_size)

            state = torch.tensor(state, dtype=torch.float).to(device)
            next_state = torch.tensor(next_state, dtype=torch.float).to(device)
            action = torch.tensor(action_, dtype=torch.float).to(device)
            reward = torch.tensor(reward, dtype=torch.float).reshape(
                (-1, 1)).to(device)
            done = torch.tensor(done, dtype=torch.float).reshape(
                (-1, 1)).to(device)
            # reward = torch.FloatTensor(reward).reshape((self.config.batch_size,1)).to(device)
            # done = torch.FloatTensor(done).reshape((self.config.batch_size,1)).to(device)

            # Select next action according to target policy:
            noise = torch.tensor(action_, dtype=torch.float).data.normal_(
                0, self.config.policy_noise).to(device)
            noise = noise.clamp(-self.config.noise_clip,
                                self.config.noise_clip)
            next_action = (self.actor_target(next_state) + noise)
            next_action = next_action.clamp(-self.config.max_action,
                                            self.config.max_action)

            # Compute target Q-value:
            target_Q1 = self.critic_1_target(next_state, next_action)
            target_Q2 = self.critic_2_target(next_state, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward + (
                (1 - done) * self.config.gamma * target_Q).detach()

            # Optimize Critic 1:
            current_Q1 = self.critic_1(state, action)
            loss_Q1 = F.mse_loss(current_Q1, target_Q)
            self.critic_1_optimizer.zero_grad()
            loss_Q1.backward()
            self.critic_1_optimizer.step()

            # Optimize Critic 2:
            current_Q2 = self.critic_2(state, action)
            loss_Q2 = F.mse_loss(current_Q2, target_Q)
            self.critic_2_optimizer.zero_grad()
            loss_Q2.backward()
            self.critic_2_optimizer.step()

            # Delayed policy updates:
            if i % self.config.policy_delay == 0:
                # Compute actor loss:
                actor_loss = -self.critic_1(state, self.actor(state)).mean()

                # Optimize the actor
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                # Polyak averaging update:
                for param, target_param in zip(self.actor.parameters(),
                                               self.actor_target.parameters()):
                    target_param.data.copy_(
                        (self.config.polyak * target_param.data) +
                        ((1 - self.config.polyak) * param.data))

                for param, target_param in zip(
                        self.critic_1.parameters(),
                        self.critic_1_target.parameters()):
                    target_param.data.copy_(
                        (self.config.polyak * target_param.data) +
                        ((1 - self.config.polyak) * param.data))

                for param, target_param in zip(
                        self.critic_2.parameters(),
                        self.critic_2_target.parameters()):
                    target_param.data.copy_(
                        (self.config.polyak * target_param.data) +
                        ((1 - self.config.polyak) * param.data))

    def cuda(self):
        self.actor.to(device)
        self.actor_target.to(device)
        self.critic_1.to(device)
        self.critic_1_target.to(device)
        self.critic_2.to(device)
        self.critic_2_target.to(device)

    def load_weights(self, model_path):
        policy = torch.load(model_path)
        if 'actor' in policy:
            self.actor.load_state_dict(policy['actor'])
        else:
            self.actor.load_state_dict(policy)

    def save_model(self, output, name=''):
        torch.save(self.actor.state_dict(), '%s/actor_%s.pkl' % (output, name))

    def save_config(self, output):
        with open(output + '/config.txt', 'w') as f:
            attr_val = get_class_attr_val(self.config)
            for k, v in attr_val.items():
                f.write(str(k) + " = " + str(v) + "\n")

    def save_checkpoint(self, fr, output):
        checkpath = output + '/checkpoint_policy'
        os.makedirs(checkpath, exist_ok=True)
        torch.save(
            {
                'frames': fr,
                'actor': self.actor.state_dict(),
                'critic_1': self.critic_1.state_dict(),
                'critic_2': self.critic_2.state_dict(),
            }, '%s/checkpoint_fr_%d.tar' % (checkpath, fr))

    def load_checkpoint(self, model_path):
        checkpoint = torch.load(model_path)
        fr = checkpoint['frames']
        self.actor.load_state_dict(checkpoint['actor'])
        self.critic_1.load_state_dict(checkpoint['critic_1'])
        self.critic_2.load_state_dict(checkpoint['critic_2'])
        return fr
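
# --- Usage sketch (not part of the original example) -----------------------
# Rough driving loop for the TD3 class above, assuming a classic gym-style
# environment and that the ReplayBuffer exposes an add(state, action, reward,
# next_state, done) method matching the order returned by sample(); the names
# `env_name`, `warmup`, `max_frames` and `update_every` are introduced only
# for this sketch.
import gym

def run_td3(config, env_name='Pendulum-v1', warmup=1_000,
            max_frames=50_000, update_every=50):
    env = gym.make(env_name)
    agent = TD3(config)
    state, fr = env.reset(), 0
    while fr < max_frames:
        fr += 1
        action = (env.action_space.sample() if fr < warmup
                  else agent.act(state))
        next_state, reward, done, _ = env.step(action)
        agent.buffer.add(state, action, reward, next_state, float(done))  # assumed buffer API
        state = env.reset() if done else next_state
        if fr >= warmup and fr % update_every == 0:
            # learning() applies the delayed policy update internally
            agent.learning(fr, t=update_every)
    return agent
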
class DDPG():

    def __init__(self, state_size, action_size, CER=False, random_seed=23,
                 fc1_units=96, fc2_units=96, epsilon=1.0, lr_actor=1e-3,
                 lr_critic=1e-3, weight_decay=0):
        self.state_size = state_size
        self.action_size = action_size 
        self.CER = CER
        self.EXPmemory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
        self.CERmem = ReplayBuffer(action_size, CER_SIZE, BATCH_SIZE, random_seed)
        self.random_seed = random_seed
        self.fc1_units = fc1_units
        self.fc2_units = fc2_units
        self.state_size = state_size
        self.action_size = action_size        
        self.epsilon = epsilon
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.weight_decay = weight_decay
        self.noise = OUNoise(action_size, random_seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        random.seed(random_seed)
 
        self.actor = Actor(self.state_size, self.action_size, self.random_seed,
                           fc1_units=self.fc1_units, fc2_units=self.fc2_units).to(device)
        self.actor_target = Actor(self.state_size, self.action_size, self.random_seed,
                                  fc1_units=self.fc1_units, fc2_units=self.fc2_units).to(device)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=self.lr_actor)

        self.critic = Critic(self.state_size, self.action_size, self.random_seed,
                             fc1_units=self.fc1_units, fc2_units=self.fc2_units).to(device)
        self.critic_target = Critic(self.state_size, self.action_size, self.random_seed,
                                    fc1_units=self.fc1_units, fc2_units=self.fc2_units).to(device)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=self.lr_critic, 
                                           weight_decay=self.weight_decay)
        # Initialize target and local being the same
        self.hard_copy(self.actor_target, self.actor)
        self.hard_copy(self.critic_target, self.critic)

    def step(self, states, actions, rewards, next_states, dones):
        # Save experience in replay memory
        for state, action, reward, next_state, done in zip(states, actions, rewards, next_states,dones):
            self.EXPmemory.add(state, action, reward, next_state, done)
            if(self.CER):
                self.CERmem.add(state, action, reward, next_state, done)
        
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.EXPmemory) > BATCH_SIZE:
                for _ in range(NUM_UPDATES):
                    experiences = self.EXPmemory.sample()
                    self.learn(experiences, GAMMA)
                if(self.CER):
                    for _ in range(5):
                        experiences = self.CERmem.sample()
                        self.learn(experiences, GAMMA)
            
    def act(self, state, add_noise=True):
        state = torch.from_numpy(state).float().to(device)

        self.actor.eval()
        with torch.no_grad():
            action = self.actor(state).cpu().data.numpy()
        self.actor.train()

        if add_noise: # Add noise for exploration
            action += self.epsilon * self.noise.sample()

        return action

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma, tau=1e-3, epsilon_decay=1e-6):        
        states, actions, rewards, next_states, dones = experiences
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)

        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        Q_expected = self.critic(states, actions)
                
        # Critic update
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 1)
        self.critic_optimizer.step()

        # Actor update
        actions_pred = self.actor(states)
        actor_loss = -self.critic(states, actions_pred).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Targets update
        self.soft_update(self.critic, self.critic_target, tau)
        self.soft_update(self.actor, self.actor_target, tau)

        # Noise update
        self.epsilon -= epsilon_decay
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):        
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

    def hard_copy(self, target, source):
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(param.data)
    
    def save(self, path):
        torch.save(self.actor.state_dict(), 
                   path+ str(self.fc1_units)+'_'+str(self.fc2_units) + '_actor.pth')
        torch.save(self.critic.state_dict(),
                   path+ str(self.fc1_units)+'_'+str(self.fc2_units) + '_critic.pth')
    
    def load(self, actor_file, critic_file):
        self.actor.load_state_dict(torch.load(actor_file))
        self.critic.load_state_dict(torch.load(critic_file))
        self.hard_copy(self.actor_target, self.actor)
        self.hard_copy(self.critic_target, self.critic)
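
# --- Usage sketch (not part of the original example) -----------------------
# The CER flag above adds a second, small buffer (CERmem) of the most recent
# transitions, so each learning phase mixes uniform replay with a few batches
# of fresh experience (combined experience replay). A rough multi-agent
# driving loop; `env_step` is a hypothetical callable standing in for a
# vectorised environment that returns one row per agent.
import numpy as np

def run_episode(agent, env_step, init_states, max_t=1000):
    states = init_states                      # shape: (num_agents, state_size)
    scores = np.zeros(len(states))
    for _ in range(max_t):
        actions = np.clip(agent.act(states), -1, 1)
        next_states, rewards, dones = env_step(actions)   # hypothetical env API
        agent.step(states, actions, rewards, next_states, dones)
        scores += rewards
        states = next_states
        if np.any(dones):
            break
    return scores
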
Example #13
0
class DDPG(object):
    def __init__(self, nb_status, nb_actions, args, writer):
        self.clip_actor_grad = args.clip_actor_grad
        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.writer = writer
        self.select_time = 0
        
        # Create Actor and Critic Network
        net_cfg = {
            'hidden1':args.hidden1, 
            'hidden2':args.hidden2, 
            'init_method':args.init_method
        }

        self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_optim  = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim  = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor) # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)
        
        #Create replay buffer
        self.memory = rpm(args.rmsize)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        # 
        self.epsilon = 1.0
        self.s_t = None # Most recent state
        self.a_t = None # Most recent action
        self.use_cuda = args.cuda
        # 
        if self.use_cuda: self.cuda()

    def update_policy(self, train_actor = True):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # Prepare for the target q batch; torch.no_grad() replaces the
        # `volatile` flag removed in modern PyTorch
        with torch.no_grad():
            next_q_values = self.critic_target([
                to_tensor(next_state_batch),
                self.actor_target(to_tensor(next_state_batch)),
            ])

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor((1 - terminal_batch.astype(np.float32))) * next_q_values

        # Critic update
        self.critic.zero_grad()

        q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])

        # print(reward_batch, next_q_values*self.discount, target_q_batch, terminal_batch.astype(np.float))
        value_loss = nn.MSELoss()(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        self.actor.zero_grad()

        policy_loss = -self.critic([
            to_tensor(state_batch),
            self.actor(to_tensor(state_batch))
        ])

        policy_loss = policy_loss.mean()
        policy_loss.backward()

        if self.clip_actor_grad is not None:
            torch.nn.utils.clip_grad_norm_(self.actor.parameters(), float(self.clip_actor_grad))

            if self.writer is not None:
                mean_policy_grad = np.array(np.mean([np.linalg.norm(p.grad.data.cpu().numpy().ravel()) for p in self.actor.parameters()]))
                #print(mean_policy_grad)
                self.writer.add_scalar('train/mean_policy_grad', mean_policy_grad, self.select_time)

        if train_actor:
            self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return -policy_loss, value_loss

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def train(self):
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1.,1.,self.nb_actions)
        self.a_t = action
        return action

    def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0):
        self.eval()
        # print(s_t.shape)
        action = to_numpy(
            self.actor(to_tensor(np.array([s_t])))
        ).squeeze(0)
            
        self.train()
        noise_level = noise_level * max(self.epsilon, 0)
        
        action = action * (1 - noise_level) + (self.random_process.sample() * noise_level)
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=1):        
        if output is None: return
        self.actor.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num))
        )
        self.actor_target.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num))
        )
        self.critic.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )
        self.critic_target.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )

    def save_model(self, output, num):
        if self.use_cuda:
            self.actor.cpu()
            self.critic.cpu()
        torch.save(
            self.actor.state_dict(),
            '{}/actor{}.pkl'.format(output, num)
        )
        torch.save(
            self.critic.state_dict(),
            '{}/critic{}.pkl'.format(output, num)
        )
        if self.use_cuda:
            self.actor.cuda()
            self.critic.cuda()
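
# --- Usage sketch (not part of the original example) -----------------------
# The interaction pattern implied by the class above: reset() seeds s_t,
# select_action() stores a_t, observe() appends (s_t, a_t, r, s_t1, done) to
# the replay memory, and update_policy() runs one critic/actor update.
# `env`, `steps`, `warmup` and `noise_level` are hypothetical names; warmup
# should be at least args.batch_size so that sampling succeeds.
def ddpg_rollout(agent, env, steps=10_000, warmup=200, noise_level=0.3):
    obs = env.reset()
    agent.reset(obs)
    for step in range(steps):
        if step < warmup:
            action = agent.random_action()
        else:
            action = agent.select_action(obs, noise_level=noise_level)
        obs, reward, done, _ = env.step(action)
        agent.observe(reward, obs, done)
        if step >= warmup:
            agent.update_policy()
        if done:
            obs = env.reset()
            agent.reset(obs)
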
Example #14
0
                    break
            scores.append(score)
        score_avg = np.mean(scores)
        print('{} episode score is {:.2f}'.format(episodes, score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)

        actor.train(), critic.train()
        train_model(actor, critic, memory, actor_optim, critic_optim)

        if iter % 100 == 0:
            score_avg = int(score_avg)

            model_path = os.path.join(os.getcwd(), 'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(model_path,
                                     'ckpt_' + str(score_avg) + '.pth.tar')

            save_checkpoint(
                {
                    'actor': actor.state_dict(),
                    'critic': critic.state_dict(),
                    'z_filter_n': running_state.rs.n,
                    'z_filter_m': running_state.rs.mean,
                    'z_filter_s': running_state.rs.sum_square,
                    'args': args,
                    'score': score_avg
                },
                filename=ckpt_path)
Example #15
0
    print('* (Train) Epoch: {} | G Loss: {:.4f} | C Loss: {:.4f}'.format(
        epoch, g_train_loss, c_train_loss))
    return (g_train_loss, c_train_loss)


train_loader, vocab = load(batch_size, seq_len)
autoencoder = Autoencoder(enc_hidden_dim, dec_hidden_dim, embedding_dim,
                          latent_dim, vocab.size(), dropout, seq_len)
autoencoder.load_state_dict(
    torch.load('autoencoder.th', map_location=lambda x, y: x))
generator = Generator(n_layers, block_dim)
critic = Critic(n_layers, block_dim)

g_optimizer = optim.Adam(generator.parameters(), lr=lr)
c_optimizer = optim.Adam(critic.parameters(), lr=lr)
if cuda:
    autoencoder = autoencoder.cuda()
    generator = generator.cuda()
    critic = critic.cuda()

best_loss = np.inf

for epoch in range(1, epochs + 1):
    g_loss, c_loss = train(epoch)
    loss = g_loss + c_loss
    if loss < best_loss:
        best_loss = loss
        print('Saved')
        torch.save(generator.state_dict(), 'generator.th')
        torch.save(critic.state_dict(), 'critic.th')
Example #16
0
class Agent():
    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 device,
                 gamma=GAMMA,
                 tau=TAU,
                 lr_actor=LR_ACTOR,
                 lr_critic=LR_CRITIC,
                 random_seed=0):
        """
            Initialize an Agent object.
        :param state_size: size of state
        :param action_size: size of action
        :param num_agents: number of agents
        :param gamma: discount factor
        :param tau: factor for soft update of target parameters
        :param lr_actor: Learning rate of actor
        :param lr_critic: Learning rate of critic
        :param random_seed: Random seed
        :param device: cuda or cpu
        """

        self.device = device
        self.gamma = gamma
        self.tau = tau

        self.num_agents = num_agents

        self.state_size = state_size
        self.action_size = action_size
        self.full_state_size = state_size * num_agents
        self.full_action_size = action_size * num_agents
        self.seed = random_seed
        random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, device,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, device,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(self.full_state_size,
                                   self.full_action_size,
                                   device=device,
                                   random_seed=random_seed).to(device)
        self.critic_target = Critic(self.full_state_size,
                                    self.full_action_size,
                                    device=device,
                                    random_seed=random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=0)

        self.noise = OUNoise(action_size, random_seed)

    def save_model(self, agent_number):
        torch.save(self.actor_local.state_dict(),
                   f'models/checkpoint_actor_{agent_number}.pth')
        torch.save(self.critic_local.state_dict(),
                   f'models/checkpoint_critic_{agent_number}.pth')

    def load_model(self, agent_number):
        checkpoint = torch.load(f'models/checkpoint_actor_{agent_number}.pth',
                                map_location=torch.device('cpu'))
        self.actor_local.load_state_dict(checkpoint)

        checkpoint = torch.load(f'models/checkpoint_critic_{agent_number}.pth',
                                map_location=torch.device('cpu'))
        self.critic_local.load_state_dict(checkpoint)

    def act(self, state, noise=0., train=False):
        """Returns actions for given state as per current policy.
        :param state: state as seen from single agent
        """

        if train is True:
            self.actor_local.train()
        else:
            self.actor_local.eval()

        action = self.actor_local(state)
        if noise > 0:
            noise = torch.tensor(noise * self.noise.sample(),
                                 dtype=state.dtype,
                                 device=state.device)
        return action + noise

    def target_act(self, state, noise=0.):
        #self.actor_target.eval()
        # convert to cpu() since noise is in cpu()
        self.actor_target.eval()
        action = self.actor_target(state).cpu()
        if noise > 0.:
            noise = torch.tensor(noise * self.noise.sample(),
                                 dtype=state.dtype,
                                 device=state.device)
        return action + noise

    def update_critic(self, rewards, dones, all_states, all_actions,
                      all_next_states, all_next_actions):
        with torch.no_grad():
            Q_targets_next = self.critic_target(all_next_states,
                                                all_next_actions)
            # Compute Q targets for current states (y_i)
        q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        q_expected = self.critic_local(all_states, all_actions)
        # critic_loss = F.mse_loss(q_expected, q_targets)
        critic_loss = ((q_expected - q_targets.detach())**2).mean()
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

    def update_actor(self, all_states, all_predicted_actions):
        """Update actor network

        :param all_states: all states
        :param all_predicted_actions: all predicted actions
        """
        actor_loss = -self.critic_local(all_states,
                                        all_predicted_actions).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor_optimizer.step()

    def update_targets(self):
        self.soft_update(self.actor_local, self.actor_target, self.tau)
        self.soft_update(self.critic_local, self.critic_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def reset(self):
        self.noise.reset()
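
# --- Usage sketch (not part of the original example) -----------------------
# The update_critic()/update_actor() split above is designed to be driven by
# a multi-agent (MADDPG-style) wrapper: target_act() on every agent builds
# all_next_actions for the centralised critic, while the acting agent's own
# action is re-predicted with its local actor for the policy loss. The
# wrapper below and its argument names are assumptions for this sketch, and
# all tensors are assumed to live on the same device.
import torch

def maddpg_update(agents, obs, actions, rewards, dones, next_obs):
    # obs / actions / next_obs: lists of per-agent batch tensors
    all_states = torch.cat(obs, dim=1)
    all_actions = torch.cat(actions, dim=1)
    all_next_states = torch.cat(next_obs, dim=1)
    all_next_actions = torch.cat(
        [a.target_act(o) for a, o in zip(agents, next_obs)], dim=1)
    for i, agent in enumerate(agents):
        agent.update_critic(rewards[i], dones[i], all_states, all_actions,
                            all_next_states, all_next_actions)
        predicted = [a.act(o, train=True) if j == i else actions[j].detach()
                     for j, (a, o) in enumerate(zip(agents, obs))]
        agent.update_actor(all_states, torch.cat(predicted, dim=1))
        agent.update_targets()
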
Example #17
0
class Agent(object):
    """
    Interacts with and learns from the environment.
    """

    def __init__(self, state_size, action_size, num_agents,
                 seed=0, buffer_size=int(1e6),
                 actor_lr=1e-4, actor_hidden_sizes=(128, 256), actor_weight_decay=0,
                 critic_lr=1e-4, critic_hidden_sizes=(128, 256, 128), critic_weight_decay=0,
                 batch_size=128, gamma=0.99, tau=1e-3):
        """
        Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents to train
            seed (int): random seed, default value is 0
            buffer_size (int): buffer size of experience memory, default value is int(1e6)

            actor_lr (float): learning rate of actor model, default value is 1e-4
            actor_hidden_sizes (tuple): size of hidden layer of actor model, default value is (128, 256)
            critic_lr (float): learning rate of critic model, default value is 1e-4
            critic_hidden_sizes (tuple): size of hidden layer of critic model, default value is (128, 256, 128)

            batch_size (int): mini-batch size
            gamma (float): discount factor
            tau (float): interpolation parameter
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = seed

        self.batch_size = batch_size  # mini-batch size
        self.gamma = gamma  # discount factor
        self.tau = tau  # for soft update of target parameters

        # Actor Network
        self.actor_local = Actor(state_size, action_size, seed,
                                 hidden_units=actor_hidden_sizes).to(DEVICE)
        self.actor_target = Actor(state_size, action_size, seed,
                                  hidden_units=actor_hidden_sizes).to(DEVICE)
        self.actor_target.eval()
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=actor_lr,
                                          weight_decay=actor_weight_decay)

        # Critic Network
        self.critic_local = Critic(state_size, action_size, seed,
                                   hidden_units=critic_hidden_sizes).to(DEVICE)
        self.critic_target = Critic(state_size, action_size, seed,
                                    hidden_units=critic_hidden_sizes).to(DEVICE)
        self.critic_target.eval()
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=critic_lr,
                                           weight_decay=critic_weight_decay)

        # Noise process
        self.noise = OUNoise((num_agents, action_size), seed)

        # Replay memory
        self.memory = ReplyBuffer(buffer_size=buffer_size, seed=seed)

        # copy parameters of the local model to the target model
        self.soft_update(self.critic_local, self.critic_target, 1.)
        self.soft_update(self.actor_local, self.actor_target, 1.)

        random.seed(seed)
        np.random.seed(seed)

        self.reset()

    def reset(self):
        self.noise.reset()

    def act(self, state, add_noise=True):
        # actions = np.random.randn(self.num_agents, self.action_size)
        # actions = np.clip(actions, -1, 1)

        state = torch.from_numpy(state).float().to(DEVICE)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def step(self, states, actions, rewards, next_states, dones):
        """
        Save experience in replay memory, and use random sample from buffer to learn.
        """

        # Save experience / reward
        for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample(batch_size=self.batch_size)
            self.learn(experiences, self.gamma)

    def learn(self, experiences, gamma, last_action_loss=None):
        """
        Update policy and experiences parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-experiences

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ------- update critic ------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        q_targets = rewards + (gamma * q_targets_next * (1 - dones))
        q_targets = q_targets.detach()

        # Compute critic loss
        q_expected = self.critic_local(states, actions)
        assert q_expected.shape == q_targets.shape
        critic_loss = F.mse_loss(q_expected, q_targets)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1.0)  # clip the gradient (Udacity)
        self.critic_optimizer.step()

        # ------- update actor ------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        #  update target networks
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

        return actor_loss.item(), critic_loss.item()

    def soft_update(self, local_model, target_model, tau):
        """
        Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.detach_()
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def save(self):
        """
        Save model state
        """
        torch.save(self.actor_local.state_dict(), "checkpoints/checkpoint_actor.pth")
        torch.save(self.actor_target.state_dict(), "checkpoints/checkpoint_actor_target.pth")
        torch.save(self.critic_local.state_dict(), "checkpoints/checkpoint_critic.pth")
        torch.save(self.critic_target.state_dict(), "checkpoints/checkpoint_critic_target.pth")

    def load(self):
        """
        Load model state
        """
        self.actor_local.load_state_dict(torch.load("checkpoints/checkpoint_actor.pth", map_location=lambda storage, loc: storage))
        self.actor_target.load_state_dict(torch.load("checkpoints/checkpoint_actor_target.pth", map_location=lambda storage, loc: storage))
        self.critic_local.load_state_dict(torch.load("checkpoints/checkpoint_critic.pth", map_location=lambda storage, loc: storage))
        self.critic_target.load_state_dict(torch.load("checkpoints/checkpoint_critic_target.pth", map_location=lambda storage, loc: storage))

    def __str__(self):
        return f"{str(self.actor_local)}\n{str(self.critic_local)}"
Example #18
0
class Agent:
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (GAMMA * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def load(self, model_dir, agent_id):
        # Load Actor and Critic network weights
        self.actor_local.load_state_dict(
            torch.load(
                os.path.join(model_dir,
                             'agent_{0}_actor.pth'.format(agent_id))))
        self.critic_local.load_state_dict(
            torch.load(
                os.path.join(model_dir,
                             'agent_{0}_critic.pth'.format(agent_id))))

    def save(self, model_dir, agent_id):
        # Save Actor and Critic network weights
        torch.save(
            self.actor_local.state_dict(),
            os.path.join(model_dir, 'agent_{0}_actor.pth'.format(agent_id)))
        torch.save(
            self.critic_local.state_dict(),
            os.path.join(model_dir, 'agent_{0}_critic.pth'.format(agent_id)))
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self):
        self.config = Config()
        random.seed(self.config.seed)

        # Actor Network
        self.actor_local = Actor()
        self.actor_target = Actor()
        local_state_dict = self.actor_local.state_dict()
        self.actor_target.load_state_dict(local_state_dict)
        self.actor_optimizer = optim.Adam(
            self.actor_local.parameters(),
            lr=self.config.actor_lr)
        self.actor_lr_scheduler = optim.lr_scheduler.StepLR(
            self.actor_optimizer,
            step_size=self.config.lr_sched_step,
            gamma=self.config.lr_sched_gamma)

        # Critic Network
        self.critic_local = Critic()
        self.critic_target = Critic()
        local_state_dict = self.critic_local.state_dict()
        self.critic_target.load_state_dict(local_state_dict)
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=self.config.critic_lr)
        self.critic_lr_scheduler = optim.lr_scheduler.StepLR(
            self.critic_optimizer,
            step_size=self.config.lr_sched_step,
            gamma=self.config.lr_sched_gamma)

        # Initialize a noise process
        self.noise = OUNoise()

    def soft_update(self):
        """Soft update actor and critic parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        """
        tau = self.config.tau
        for target_param, local_param \
                in zip(self.actor_target.parameters(),
                       self.actor_local.parameters()):
            target_param.data.copy_(
                tau * local_param.data
                + (1.0 - tau) * target_param.data)
        for target_param, local_param \
                in zip(self.critic_target.parameters(),
                       self.critic_local.parameters()):
            target_param.data.copy_(
                tau * local_param.data
                + (1.0 - tau) * target_param.data)

    def act(self, state):
        """Returns actions for given state as per current policy."""
        with torch.no_grad():
            self.actor_local.eval()
            state = torch.from_numpy(state).float()
            state = state.to(self.config.device)
            action = self.actor_local(state).data.cpu().numpy()
            self.actor_local.train()

        if self.config.noise:
            action += self.noise.sample()
            np.clip(action, a_min=-1, a_max=1, out=action)

        return action

    def lr_step(self):
        self.actor_lr_scheduler.step()
        self.critic_lr_scheduler.step()

    def reset_noise(self):
        self.noise.reset()
Example #20
0
class Agent():
    def __init__(self, test=False):
        # device
        if torch.cuda.is_available():
            self.device = torch.device('cuda')
        else:
            self.device = torch.device('cpu')
        #########################################
        """
        Some hand tune config(for developing)
        """
        self.discrete = False
        self.action_dim = 1
        self.state_dim = 3
        self.batch_size = 100
        self.action_low = -2
        self.action_high = 2
        ##########################################
        self.P_online = Actor(state_dim=self.state_dim,
                              action_size=self.action_dim).to(self.device)
        self.P_target = Actor(state_dim=self.state_dim,
                              action_size=self.action_dim).to(self.device)
        self.P_target.load_state_dict(self.P_online.state_dict())
        self.Q_online = Critic(state_size=self.state_dim,
                               action_size=self.action_dim).to(self.device)
        self.Q_target = Critic(state_size=self.state_dim,
                               action_size=self.action_dim).to(self.device)
        self.Q_target.load_state_dict(self.Q_online.state_dict())
        # discounted reward
        self.gamma = 0.99
        self.eps = 0.25
        # optimizer
        self.q_optimizer = torch.optim.Adam(self.Q_online.parameters(),
                                            lr=1e-3)
        self.p_optimizer = torch.optim.Adam(self.P_online.parameters(),
                                            lr=1e-3)
        # saved rewards and actions
        self.replay_buffer = ReplayBuffer()

        # noise
        self.noise = Noise(DELTA, SIGMA, OU_A, OU_MU)
        # Initialize noise
        self.ou_level = 0.

        self.ep_step = 0

    def act(self, state, test=False):
        if not test:
            with torch.no_grad():
                # boring type casting
                state = ((torch.from_numpy(state)).unsqueeze(0)).float().to(
                    self.device)
                action = self.P_online(state)  # continuous output
                a = action.data.cpu().numpy()
                # if self.ep_step < 200:
                # self.ou_level = self.noise.ornstein_uhlenbeck_level(self.ou_level)
                # a = a + self.ou_level
                if self.discrete:
                    action = np.argmax(a)
                    return a, action
                else:
                    if self.ep_step < 200:
                        self.ou_level = self.noise.ornstein_uhlenbeck_level(
                            self.ou_level)
                    action = np.clip(a + self.ou_level, self.action_low,
                                     self.action_high)
                    return action, action
        else:
            # the test-time branch was missing in the original; as a best
            # guess, return the deterministic policy output without OU noise
            with torch.no_grad():
                state = torch.from_numpy(state).unsqueeze(0).float().to(
                    self.device)
                a = self.P_online(state).data.cpu().numpy()
            if self.discrete:
                return a, np.argmax(a)
            action = np.clip(a, self.action_low, self.action_high)
            return action, action

    def collect_data(self, state, action, reward, next_state, done):
        self.replay_buffer.push(
            torch.from_numpy(state).float().unsqueeze(0),
            torch.from_numpy(action).float(),
            torch.tensor([reward]).float().unsqueeze(0),
            torch.from_numpy(next_state).float().unsqueeze(0),
            torch.tensor([done]).float().unsqueeze(0))

    def clear_data(self):
        raise NotImplementedError("Circular queue doesn't need this function")

    def update(self):
        if len(self.replay_buffer) < self.batch_size:
            return
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(
            batch_size=self.batch_size, device=self.device)
        # discounted rewards
        # rewards = torch.from_numpy(discount((rewards.view(rewards.shape[0])).cpu().numpy())).float().to(self.device)

        ### debug shape : ok
        #===============================Critic Update===============================
        self.Q_online.train()
        Q = self.Q_online((states, actions))

        with torch.no_grad():  # don't need backprop for target value
            self.Q_target.eval()
            self.P_target.eval()
            target = rewards + self.gamma * (1 - dones) * self.Q_target(
                (next_states, self.P_target(next_states)))
        critic_loss_fn = torch.nn.MSELoss()
        critic_loss = critic_loss_fn(Q, target).mean()
        # update
        self.q_optimizer.zero_grad()
        critic_loss.backward()
        self.q_optimizer.step()
        # print("critic loss", critic_loss.item())

        #===============================Actor Update===============================
        # fix online_critic , update online_actor
        self.Q_online.eval()
        for p in self.Q_online.parameters():
            p.requires_grad = False
        for p in self.P_online.parameters():
            p.requires_grad = True
        policy_loss = -self.Q_online((states, self.P_online(states)))
        policy_loss = policy_loss.mean()
        self.p_optimizer.zero_grad()
        policy_loss.backward()
        self.p_optimizer.step()
        # print("policy loss", policy_loss.item())
        for p in self.Q_online.parameters():
            p.requires_grad = True
        #===============================Target Update===============================
        soft_update(self.Q_target, self.Q_online, tau=1e-3)
        soft_update(self.P_target, self.P_online, tau=1e-3)
        self.eps -= EPSILON_DECAY
        if self.eps <= 0:
            self.eps = 0
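
# --- Usage sketch (not part of the original example) -----------------------
# The hard-coded dimensions above (state_dim=3, action range [-2, 2]) match a
# Pendulum-style task, so a plausible driving loop looks like this; the env
# name and episode count are assumptions, and the classic 4-tuple gym step
# API is assumed, as in the other examples.
import gym

def train_pendulum(agent, episodes=100):
    env = gym.make('Pendulum-v1')
    for _ in range(episodes):
        state = env.reset()
        agent.ep_step = 0
        done = False
        while not done:
            agent.ep_step += 1
            action, _ = agent.act(state)          # noisy, clipped action
            action = action.flatten()
            next_state, reward, done, _ = env.step(action)
            agent.collect_data(state, action, reward, next_state, done)
            agent.update()                        # no-op until buffer >= batch_size
            state = next_state
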
Example #21
0
class TD3(object):
    def __init__(self, env, writer=None):
        """
        Twin Delayed Deep Deterministic Policy Gradient Algorithm(TD3)
        """
        self.env = env
        self.writer = writer

        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        self.max_action = env.action_space.high[0]

        # Randomly initialize network parameter
        self.actor = Actor(state_dim, action_dim).to('cuda')
        self.critic = Critic(state_dim, action_dim).to('cuda')

        # Initialize target network parameter
        self.target_actor = Actor(state_dim, action_dim).to('cuda')
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic = Critic(state_dim, action_dim).to('cuda')
        self.target_critic.load_state_dict(self.critic.state_dict())

        # Replay memory
        self.memory = ReplayMemory(state_dim, action_dim)

        self.gamma = gamma
        self.tau = tau

        # network parameter optimizer
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=critic_lr,
                                           weight_decay=weight_decay)

    def get_action(self, state, initial_act=False):
        if initial_act:
            return self.env.action_space.sample()
        action = self.actor(torch.from_numpy(state).to('cuda', torch.float))
        action = np.random.normal(0, 0.1) + action.detach().cpu().numpy()
        return np.clip(action, -1, 1)

    def store_transition(self, state, action, state_, reward, done):
        self.memory.store_transition(state, action, state_, reward, done)

    def soft_update(self, target_net, net):
        """Target parameters soft update"""
        for target_param, param in zip(target_net.parameters(),
                                       net.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1 - self.tau) * target_param.data)

    def update(self, time_step, batch_size=64):
        states, actions, states_, rewards, terminals = self.memory.sample(
            batch_size)

        # Update Critic
        with torch.no_grad():
            noise = (torch.randn_like(actions) * policy_noise).clamp(
                -noise_clip, noise_clip)

            actions_ = (self.target_actor(states_) + noise).clamp(
                -self.max_action, self.max_action)

            target_q1, target_q2 = self.target_critic(states_, actions_)
            y = rewards.unsqueeze(1) + terminals.unsqueeze(
                1) * gamma * torch.min(target_q1, target_q2)
        q1, q2 = self.critic(states, actions)
        critic_loss = F.mse_loss(q1, y) + F.mse_loss(q2, y)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()
        if self.writer and time_step:
            self.writer.add_scalar("loss/critic", critic_loss.item(),
                                   time_step)

        # Delayed Policy Update
        if time_step % policy_freq == 0:
            # Update Actor
            actor_loss = -1 * self.critic.Q1(states, self.actor(states)).mean()
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()
            if self.writer:
                self.writer.add_scalar("loss/actor", actor_loss.item(),
                                       time_step)

            # target parameter soft update
            self.soft_update(self.target_actor,
                             self.actor)  # update target actor network
            self.soft_update(self.target_critic,
                             self.critic)  # update target critic network

    def save_model(self, path='models/'):
        torch.save(self.actor.state_dict(), path + 'actor')
        torch.save(self.critic.state_dict(), path + 'critic')
        torch.save(self.target_actor.state_dict(), path + 'target_actor')
        torch.save(self.target_critic.state_dict(), path + 'target_critic')

    def load_model(self, path='models/'):
        self.actor.load_state_dict(torch.load(path + 'actor'))
        self.critic.load_state_dict(torch.load(path + 'critic'))
        self.target_actor.load_state_dict(torch.load(path + 'target_actor'))
        self.target_critic.load_state_dict(torch.load(path + 'target_critic'))
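
# --- Usage sketch (not part of the original example) -----------------------
# Driving loop implied by the class above: random actions through
# get_action(initial_act=True) until the buffer has warmed up, then policy
# actions with Gaussian exploration noise and one update() per step.
# `start_steps` and `total_steps` are names introduced for this sketch; the
# module-level constants (gamma, policy_noise, policy_freq, ...) are assumed
# to be defined as in the original.
def run_td3_agent(agent, total_steps=100_000, start_steps=10_000, batch_size=64):
    env = agent.env
    state = env.reset()
    for t in range(total_steps):
        action = agent.get_action(state, initial_act=(t < start_steps))
        next_state, reward, done, _ = env.step(action)
        agent.store_transition(state, action, next_state, reward, done)
        state = env.reset() if done else next_state
        if t >= start_steps:
            agent.update(t, batch_size=batch_size)
    return agent
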
Example #22
0
class MiADDPG():
    """Multiple independent Agents trained with DDPG

    This class allows to shared experience-buffer and critic network of the
    class::Agent.
    """
    def __init__(self,
                 num_agents,
                 state_size,
                 action_size,
                 random_seed,
                 lr_actor=1e-4,
                 lr_critic=1e-3,
                 weight_decay=0,
                 tau=1e-3,
                 gamma=0.99,
                 batch_size=128,
                 buffer_size=int(1e5),
                 share_critic=True,
                 share_buffer=True):
        """Initialize an multi-agent wrapper

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            tau (float): control soft-update
            gamma (float): discount factor
            batch_size (int): size of training batch
            buffer_size (int) : cap on number of experiences
        """
        self.state_size = state_size
        self.action_size = action_size
        self.batch_size = batch_size
        self.tau = tau
        self.gamma = gamma

        self.agents = [
            Agent(state_size,
                  action_size,
                  random_seed,
                  lr_actor=1e-4,
                  lr_critic=1e-3,
                  weight_decay=0) for i in range(num_agents)
        ]
        self.share_critic = share_critic
        if share_critic:
            self.critic_local = Critic(state_size, action_size,
                                       random_seed).to(device)
            self.critic_target = Critic(state_size, action_size,
                                        random_seed).to(device)
            self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                               lr=lr_critic,
                                               weight_decay=weight_decay)
            for agent in self.agents:
                agent.critic_local = None
                agent.critic_target = None
                agent.critic_optimizer = None

        self.share_buffer = share_buffer
        num_buffer = num_agents
        if share_buffer:
            num_buffer = 1  # all agents write to and sample from one buffer
        self.memory = [
            ReplayBuffer(buffer_size, batch_size) for i in range(num_buffer)
        ]

    def step(self, state, action, reward, next_state, done):
        "Save experience and random sample from buffer to learn"
        # Save experience / reward in replay memory
        for i in range(len(state)):
            ind = i
            if self.share_buffer:
                ind = 0
            self.memory[ind].add(state[i, ...], action[i, ...], reward[i],
                                 next_state[i, ...], done[i])

        # Learn, if enough samples are available in memory
        c_i = random.randint(0, len(self.agents) - 1)
        for i, agent in enumerate(self.agents):
            update_critic = True
            if self.share_critic and i != c_i:
                update_critic = False

            ind = i
            if self.share_buffer:
                ind = 0
            if len(self.memory[ind]) < self.batch_size:
                continue

            experiences = self.memory[ind].sample()
            self.learn(agent, experiences, self.gamma, update_critic)

    def act(self, state, add_noise=True):
        "Returns actions for given state as per current policy"
        state = torch.from_numpy(state).float().to(device)
        action_list = []
        for i, agent in enumerate(self.agents):
            action_list.append(agent.act(state[[i], ...]))
        return np.concatenate(action_list, axis=0)

    def load(self, filename, map_location=None):
        "Load weights for actor and critic"
        weights = torch.load(filename, map_location=map_location)
        for i, agent in enumerate(self.agents):
            agent.actor_local.load_state_dict(weights[f'actor_{i}'])
            if self.share_critic:
                self.critic_local.load_state_dict(weights['critic'])
                continue
            agent.critic_local.load_state_dict(weights[f'critic_{i}'])

    def reset(self):
        "Reset the noise process of each underlying agent"
        for agent in self.agents:
            agent.reset()

    def save(self, filename='checkpoint.pth'):
        "Serialize actor and critic weights"
        checkpoint = {}
        for i, agent in enumerate(self.agents):
            checkpoint[f'actor_{i}'] = agent.actor_local.state_dict()
            if not self.share_critic:
                checkpoint[f'critic_{i}'] = agent.critic_local.state_dict()
        if self.share_critic:
            checkpoint['critic'] = self.critic_local.state_dict()
        torch.save(checkpoint, filename)

    def learn(self, agent, experiences, gamma, update_critic=True):
        """Update policy and value parameters with a batch of experiences

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done)
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        critic_target = agent.critic_target
        critic_local = agent.critic_local
        critic_optimizer = agent.critic_optimizer
        if self.share_critic:
            critic_target = self.critic_target
            critic_local = self.critic_local
            critic_optimizer = self.critic_optimizer

        # Update critic
        # Get predicted next-state actions and Q values from target models
        actions_next = agent.actor_target(next_states)
        Q_targets_next = critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        if update_critic:
            critic_optimizer.zero_grad()
            critic_loss.backward()
            critic_optimizer.step()

        # Update actor
        # Compute actor loss
        actions_pred = agent.actor_local(states)
        actor_loss = -critic_local(states, actions_pred).mean()
        # Minimize the loss
        agent.actor_optimizer.zero_grad()
        actor_loss.backward()
        agent.actor_optimizer.step()

        # Update target networks
        soft_update(critic_local, critic_target, self.tau)
        soft_update(agent.actor_local, agent.actor_target, self.tau)
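# A minimal usage sketch for MiADDPG, assuming a gym-style multi-agent
# environment whose reset()/step() return per-agent arrays of shape
# [num_agents, ...]; `env`, `run_episode`, and the bookkeeping below are
# illustrative assumptions, not part of the original example.
import numpy as np


def run_episode(env, wrapper, max_steps=1000):
    states = env.reset()                       # [num_agents, state_size]
    scores = np.zeros(len(wrapper.agents))
    for _ in range(max_steps):
        actions = wrapper.act(states)          # one action row per agent
        next_states, rewards, dones, _ = env.step(actions)
        wrapper.step(states, actions, rewards, next_states, dones)
        scores += rewards
        states = next_states
        if np.any(dones):
            break
    return scores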
Example #23
class Agent(object):
    def __init__(
        self,
        a_dim,
        s_dim,
        a_bound,
    ):
        self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound
        #self.sess = tf.Session()
        self.P_online = Actor(s_dim, a_dim)
        self.P_target = Actor(s_dim, a_dim)
        self.P_target.load_state_dict(self.P_online.state_dict())
        self.Q_online = Critic(s_dim, a_dim)
        self.Q_target = Critic(s_dim, a_dim)
        self.Q_target.load_state_dict(self.Q_online.state_dict())
        self.q_optimizer = torch.optim.Adam(self.Q_online.parameters(),
                                            lr=LR_C)
        self.p_optimizer = torch.optim.Adam(self.P_online.parameters(),
                                            lr=LR_A)
        self.loss_td = nn.MSELoss()
        self.replay_buffer = ReplayBuffer()
        self.batch_size = 32

        self.discrete = False
        self.ep_step = 0
        # noise
        self.noise = Noise(DELTA, SIGMA, OU_A, OU_MU)
        # Initialize noise
        self.ou_level = 0.
        self.action_low = -2
        self.action_high = 2

    def act(self, state, test=False):
        with torch.no_grad():
            # boring type casting
            state = ((
                torch.from_numpy(state)).unsqueeze(0)).float().to('cpu')
            action = self.P_online(state)  # continuous output
            a = action.data.cpu().numpy()
            if self.discrete:
                action = np.argmax(a)
                return a, action
            if test:
                # evaluation: deterministic policy output without OU noise
                action = np.clip(a, self.action_low, self.action_high)
                return (torch.from_numpy(action)).view(-1)
            if self.ep_step < 200:
                self.ou_level = self.noise.ornstein_uhlenbeck_level(
                    self.ou_level)
            action = np.clip(a + self.ou_level, self.action_low,
                             self.action_high)
            return (torch.from_numpy(action)).view(-1)

    def collect_data(self, state, action, reward, next_state, done):
        self.replay_buffer.push(
            torch.from_numpy(state).float().unsqueeze(0),
            torch.from_numpy(action).float().unsqueeze(0),
            torch.tensor([reward]).float().unsqueeze(0),
            torch.from_numpy(next_state).float().unsqueeze(0),
            torch.tensor([done]).float().unsqueeze(0))

    def clear_data(self):
        raise NotImplementedError("The circular replay buffer does not need this method")

    def update(self):
        if len(self.replay_buffer) < self.batch_size:
            return
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(
            batch_size=self.batch_size, device='cpu')

        #===============================Critic Update===============================
        with torch.no_grad():
            target = rewards + GAMMA * (1 - dones) * self.Q_target(
                (next_states, self.P_target(next_states)))
        Q = self.Q_online((states, actions))
        td_error = self.loss_td(target, Q)
        self.q_optimizer.zero_grad()
        td_error.backward()
        self.q_optimizer.step()

        #===============================Actor Update===============================
        q = self.Q_online((states, self.P_online(states)))
        loss_a = -torch.mean(q)
        self.p_optimizer.zero_grad()
        loss_a.backward()
        self.p_optimizer.step()

        #===============================Target Update===============================
        soft_update(self.Q_target, self.Q_online, tau=1e-2)
        soft_update(self.P_target, self.P_online, tau=1e-2)
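# A minimal interaction-loop sketch for the Agent above, assuming a
# Pendulum-style gym environment with continuous actions in [-2, 2];
# `env`, the episode length, and the per-step update call are illustrative
# assumptions, not part of the original example.
def train_agent(env, agent, episodes=100, max_steps=200):
    for _ in range(episodes):
        state = env.reset()
        agent.ep_step = 0                      # drives the OU-noise schedule
        for _ in range(max_steps):
            action = agent.act(state).numpy()  # exploration action
            next_state, reward, done, _ = env.step(action)
            agent.collect_data(state, action, reward, next_state, done)
            agent.update()                     # one critic/actor gradient step
            state = next_state
            agent.ep_step += 1
            if done:
                break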
Example #24
class Multi_Agents():
    """
    Implements interactions and learning on environments for a set of agents
    """
    def __init__(self, agents_count, state_size, action_size, random_seed,
                 buffer_size, batch_size, gamma, fc1_units, fc2_units, noise,
                 lr_actor, lr_critic):
        """Initialize a Multi_Agent.

        Params
        ======
            agents_count (int): the number of agents
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            buffer_size(int): replay buffer size
            gamma(float): discount factor
            fc1_units (int): Number of nodes in first hidden layer
            fc2_units (int): Number of nodes in second hidden layer
            noise(Object): The noise applied to the actions selection
            lr_actor(float) : learning rates of the actor
            lr_critic(float) : learning rates of the critic
        """

        self.agents_count = agents_count
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.gamma = gamma
        self.batch_size = batch_size

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed,
                                 fc1_units, fc2_units).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed,
                                  fc1_units, fc2_units).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed,
                                   fc1_units, fc2_units).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed,
                                    fc1_units, fc2_units).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=WEIGHT_DECAY)

        # After reading the implementation of ShangtongZhang, as suggested in
        # the course, it seems relevant to initialize the weights of the target
        # networks with the same values as the local networks:
        self.actor_target.load_state_dict(self.actor_local.state_dict())
        self.critic_target.load_state_dict(self.critic_local.state_dict())

        # Noise process
        self.noise = noise

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size,
                                   random_seed)

    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        for a in range(self.agents_count):
            # save for each agent
            self.memory.add(states[a], actions[a], rewards[a], next_states[a],
                            dones[a])

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

    def act(self, states, add_noise=True):
        """Returns actions for each given state of each agent as per current policy."""

        states = torch.from_numpy(states).float().to(device)
        actions = np.empty([self.agents_count, self.action_size])

        self.actor_local.eval()
        with torch.no_grad():
            for a in range(self.agents_count):
                actions[a] = self.actor_local(states[a]).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # as suggested in the "Benchmak implementation" section of the course"
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
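# The `noise` object passed into Multi_Agents only needs sample() and reset().
# A minimal Ornstein-Uhlenbeck sketch with that interface; the class name and
# the default parameters are assumptions, not the original implementation.
import numpy as np


class OUNoiseSketch:
    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.rng = np.random.default_rng(seed)
        self.reset()

    def reset(self):
        """Reset the internal state back to the mean."""
        self.state = self.mu.copy()

    def sample(self):
        """Advance the OU process one step and return the new noise value."""
        dx = self.theta * (self.mu - self.state) \
             + self.sigma * self.rng.standard_normal(self.state.shape)
        self.state = self.state + dx
        return self.state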
Example #25
class DDPG_Agent:
    def __init__(self, state_size, action_size, seed, index=0, num_agents=2):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int):   Dimension of each state
            action_size (int):  Dimension of each action
            seed (int):         Random seed
            index (int):        Index assigned to the agent
            num_agents (int):   Number of agents in the environment
        """

        self.state_size = state_size  # State size
        self.action_size = action_size  # Action size
        self.seed = torch.manual_seed(seed)  # Random seed
        self.index = index  # Index of this agent, not used at the moment
        self.tau = TAU  # Parameter for soft weight update
        self.num_updates = N_UPDATES  # Number of updates to perform when updating
        self.num_agents = num_agents  # Number of agents in the environment
        self.tstep = 0  # Simulation step (modulo (%) UPDATE_EVERY)
        self.gamma = GAMMA  # Gamma for the reward discount
        self.alpha = ALPHA  # PER: toggle prioritization (0..1)

        # Set up actor and critic networks
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Ornstein-Uhlenbeck noise
        self.noise = OUNoise((1, action_size), seed)

        # Replay buffer
        self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE,
                                              BATCH_SIZE, seed, self.alpha)

    # act and act_targets similar to exercises and MADDPG Lab
    def act(self, states, noise=1.0):
        """Returns actions for given state as per current policy.
    
        Params
        ======
            state [n_agents, state_size]: current state
            noise (float):    control whether or not noise is added
        """
        # Convert the incoming numpy state array to a tensor on the device
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((1, self.action_size))

        # Put model into evaluation mode
        self.actor_local.eval()

        # Get actions for current state, transformed from probabilities
        with torch.no_grad():
            actions = self.actor_local(states).cpu().data.numpy()

        # Put actor back into training mode
        self.actor_local.train()

        # Ornstein-Uhlenbeck noise addition
        actions += noise * self.noise.sample()

        #  Transform probability into valid action ranges
        return np.clip(actions, -1, 1)

    def step(self, states, actions, rewards, next_states, dones, beta):
        """Save experience in replay memory, use random samples from buffer to learn.
        
        PARAMS
        ======
            states:     [n_agents, state_size]  current state
            actions:    [n_agents, action_size] taken action
            rewards:    [n_agents]              earned reward
            next_states:[n_agents, state_size]  next state
            dones:      [n_agents]              Whether episode has finished
            beta:       [0..1]                  PER: toggles correction for importance weights (0 - no corrections, 1 - full correction)
        """
        # ------------------------------------------------------------------
        # Save experience in replay memory - slightly more effort due to Prioritization
        # We need to calculate priorities for the experience tuple.
        # This is in our case (Q_expected - Q_target)**2
        # -----------------------------------------------------------------
        # Set all networks to evaluation mode
        self.actor_target.eval()
        self.critic_target.eval()
        self.critic_local.eval()

        state = torch.from_numpy(states).float().to(device)
        next_state = torch.from_numpy(next_states).float().to(device)
        action = torch.from_numpy(actions).float().to(device)
        #reward = torch.from_numpy(rewards).float().to(device)
        #done = torch.from_numpy(dones).float().to(device)

        with torch.no_grad():
            next_actions = self.actor_target(next_state)
            own_action = action[:, self.index *
                                self.action_size:(self.index + 1) *
                                self.action_size]
            if self.index:
                # Agent 1
                next_actions_agent = torch.cat((own_action, next_actions),
                                               dim=1)
            else:
                # Agent 0: flipped order
                next_actions_agent = torch.cat((next_actions, own_action),
                                               dim=1)

            # Predicted Q value from Critic target network
            Q_targets_next = self.critic_target(next_state,
                                                next_actions_agent).float()
            #print(f"Type Q_t_n: {type(Q_targets_next)}")
            #print(f"Type gamma: {type(self.gamma)}")
            #print(f"Type dones: {type(dones)}")
            Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
            Q_expected = self.critic_local(state, action)

        # Use error between Q_expected and Q_targets as priority in buffer
        error = (Q_expected - Q_targets)**2
        self.memory.add(state, action, rewards, next_state, dones, error)

        # Set all networks back to training mode
        self.actor_target.train()
        self.critic_target.train()
        self.critic_local.train()

        # ------------------------------------------------------------------
        # Usual learning procedure
        # -----------------------------------------------------------------
        # Learn every UPDATE_EVERY time steps
        self.tstep = (self.tstep + 1) % UPDATE_EVERY

        # If UPDATE_EVERY and enough samples are available in memory, get random subset and learn
        if self.tstep == 0 and len(self.memory) > BATCH_SIZE:
            for _ in range(self.num_updates):
                experiences = self.memory.sample(beta)
                self.learn(experiences)

    def reset(self):
        """Reset the noise parameter of the agent."""
        self.noise.reset()

    def learn(self, experiences):
        """Update value parameters using given batch of experience tuples. 
        Update according to 
            Q_targets = r + gamma * critic_target(next_state, actor_target(next_state))
        
        According to the lessons: 
            actor_target  (state)           gives   action
            critic_target (state, action)   gives   Q-value

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of 
                    states          states visited
                    actions         actions taken by all agents
                    rewards         rewards received
                    next states     all next states
                    dones           whether or not a final state is reached 
                    weights         weights of the experiences
                    indices         indices of the experiences            
        """

        # Load experiences from sample
        states, actions, rewards, next_states, dones, weights_cur, indices = experiences

        # ------------------- update critic ------------------- #

        # Get next actions via actor network
        next_actions = self.actor_target(next_states)

        # Stack action together with action of the agent
        own_actions = actions[:,
                              self.index * self.action_size:(self.index + 1) *
                              self.action_size]
        if self.index:
            # Agent 1
            next_actions_agent = torch.cat((own_actions, next_actions), dim=1)
        else:
            # Agent 0: flipped order
            next_actions_agent = torch.cat((next_actions, own_actions), dim=1)

        # Predicted Q value from Critic target network
        Q_targets_next = self.critic_target(next_states, next_actions_agent)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        Q_expected = self.critic_local(states, actions)

        # Update priorities in ReplayBuffer
        loss = (Q_expected - Q_targets).pow(2).reshape(
            weights_cur.shape) * weights_cur
        self.memory.update(indices, loss.data.cpu().numpy())

        # Compute critic loss
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # Clip gradients
        #torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), GRAD_CLIPPING)
        self.critic_optimizer.step()

        # ------------------- update actor ------------------- #
        actions_expected = self.actor_local(states)

        # Stack action together with action of the agent
        own_actions = actions[:,
                              self.index * self.action_size:(self.index + 1) *
                              self.action_size]
        if self.index:
            # Agent 1:
            actions_expected_agent = torch.cat((own_actions, actions_expected),
                                               dim=1)
        else:
            # Agent 0: flipped order
            actions_expected_agent = torch.cat((actions_expected, own_actions),
                                               dim=1)

        # Compute actor loss based on expectation from actions_expected
        actor_loss = -self.critic_local(states, actions_expected_agent).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update target networks
        self.target_soft_update(self.critic_local, self.critic_target)
        self.target_soft_update(self.actor_local, self.actor_target)

    def target_soft_update(self, local_model, target_model):
        """Soft update model parameters for actor and critic of all MADDPG agents.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        """

        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)

    def save(self, filename):
        """Saves the agent to the local workplace

        Params
        ======
            filename (string): where to save the weights
        """

        checkpoint = {
            'input_size': self.state_size,
            'output_size': self.action_size,
            'actor_hidden_layers': [
                each.out_features for each in self.actor_local.hidden_layers
                if each._get_name() != 'BatchNorm1d'
            ],
            'actor_state_dict': self.actor_local.state_dict(),
            'critic_hidden_layers': [
                each.out_features for each in self.critic_local.hidden_layers
                if each._get_name() != 'BatchNorm1d'
            ],
            'critic_state_dict': self.critic_local.state_dict()
        }

        torch.save(checkpoint, filename)

    def load_weights(self, filename):
        """ Load weights to update agent's actor and critic networks.
        Expected is a format like the one produced by self.save()

        Params
        ======
            filename (string): where to load data from. 
        """
        checkpoint = torch.load(filename)
        if not checkpoint['input_size'] == self.state_size:
            print(
                f"Error when loading weights from checkpoint {filename}: input size {checkpoint['input_size']} doesn't match state size of agent {self.state_size}"
            )
            return None
        if not checkpoint['output_size'] == self.action_size:
            print(
                f"Error when loading weights from checkpoint {filename}: output size {checkpoint['output_size']} doesn't match action space size of agent {self.action_size}"
            )
            return None
        my_actor_hidden_layers = [
            each.out_features for each in self.actor_local.hidden_layers
            if each._get_name() != 'BatchNorm1d'
        ]
        if not checkpoint['actor_hidden_layers'] == my_actor_hidden_layers:
            print(
                f"Error when loading weights from checkpoint {filename}: actor hidden layers {checkpoint['actor_hidden_layers']} don't match agent's actor hidden layers {my_actor_hidden_layers}"
            )
            return None
        my_critic_hidden_layers = [
            each.out_features for each in self.critic_local.hidden_layers
            if each._get_name() != 'BatchNorm1d'
        ]
        if not checkpoint['critic_hidden_layers'] == my_critic_hidden_layers:
            print(
                f"Error when loading weights from checkpoint {filename}: critic hidden layers {checkpoint['critic_hidden_layers']} don't match agent's critic hidden layers {my_critic_hidden_layers}"
            )
            return None
        self.actor_local.load_state_dict(checkpoint['actor_state_dict'])
        self.critic_local.load_state_dict(checkpoint['critic_state_dict'])
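# The PrioritizedReplayBuffer used by DDPG_Agent above is not shown in this
# snippet. Below is a minimal proportional-prioritization sketch with the same
# add / sample(beta) / update(indices, priorities) interface; the class name,
# the list-based storage, and the tensor conversion are assumptions, not the
# original implementation (which may well use a sum tree).
import numpy as np
import torch


class SimplePrioritizedBuffer:
    """Sketch of proportional prioritized experience replay."""

    def __init__(self, action_size, buffer_size, batch_size, seed, alpha, eps=1e-5):
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.alpha = alpha          # 0 = uniform sampling, 1 = fully prioritized
        self.eps = eps              # keeps every priority strictly positive
        self.data = []
        self.priorities = []
        self.rng = np.random.default_rng(seed)

    @staticmethod
    def _to_np(x):
        # accept floats, numpy arrays, or (possibly CUDA) torch tensors
        return x.detach().cpu().numpy() if torch.is_tensor(x) else np.asarray(x)

    def add(self, state, action, reward, next_state, done, error):
        if len(self.data) >= self.buffer_size:    # drop the oldest experience
            self.data.pop(0)
            self.priorities.pop(0)
        self.data.append((state, action, reward, next_state, done))
        self.priorities.append(float(np.max(self._to_np(error))) + self.eps)

    def sample(self, beta):
        probs = np.asarray(self.priorities) ** self.alpha
        probs /= probs.sum()
        idx = self.rng.choice(len(self.data), size=self.batch_size, p=probs)
        weights = (len(self.data) * probs[idx]) ** (-beta)
        weights /= weights.max()                  # normalized IS correction
        cols = list(zip(*[self.data[i] for i in idx]))
        states, actions, rewards, next_states, dones = [
            torch.as_tensor(np.vstack([self._to_np(v) for v in col]),
                            dtype=torch.float32) for col in cols
        ]
        w = torch.as_tensor(weights, dtype=torch.float32).unsqueeze(1)
        return states, actions, rewards, next_states, dones, w, idx

    def update(self, indices, priorities):
        for i, p in zip(indices, priorities):
            self.priorities[i] = float(np.max(self._to_np(p))) + self.eps

    def __len__(self):
        return len(self.data)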
Example #26
class DDPG(object):
    def __init__(self, nb_status, nb_actions, args, writer):
        self.clip_actor_grad = args.clip_actor_grad
        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.discrete = args.discrete
        self.pic = args.pic
        self.writer = writer
        self.select_time = 0        
        if self.pic:
            self.nb_status = args.pic_status
        
        # Create Actor and Critic Network
        net_cfg = {
            'hidden1':args.hidden1, 
            'hidden2':args.hidden2, 
            'use_bn':args.bn,
            'init_method':args.init_method
        }
        if args.pic:
            self.cnn = CNN(1, args.pic_status)
            self.cnn_target = CNN(1, args.pic_status)
            self.cnn_optim = Adam(self.cnn.parameters(), lr=args.crate)
        self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_optim  = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim  = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor) # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)
        if args.pic:
            hard_update(self.cnn_target, self.cnn)
        
        #Create replay buffer
        self.memory = rpm(args.rmsize) # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        # 
        self.epsilon = 1.0
        self.s_t = None # Most recent state
        self.a_t = None # Most recent action
        self.use_cuda = args.cuda
        # 
        if self.use_cuda: self.cuda()

    def normalize(self, pic):
        pic = pic.swapaxes(0, 2).swapaxes(1, 2)
        return pic

    def update_policy(self):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # Prepare for the target q batch
        if self.pic:
            state_batch = np.array([self.normalize(x) for x in state_batch])
            state_batch = to_tensor(state_batch, volatile=True)
            state_batch = self.cnn(state_batch)
            next_state_batch = np.array([self.normalize(x) for x in next_state_batch])
            next_state_batch = to_tensor(next_state_batch, volatile=True)
            next_state_batch = self.cnn_target(next_state_batch)
            next_q_values = self.critic_target([
                next_state_batch,
                self.actor_target(next_state_batch)
            ])
        else:
            next_q_values = self.critic_target([
                to_tensor(next_state_batch, volatile=True),
                self.actor_target(to_tensor(next_state_batch, volatile=True)),
            ])
        # print('batch of picture is ok')
        next_q_values.volatile = False

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor(1 - terminal_batch.astype(np.float32)) * next_q_values

        # Critic update
        self.critic.zero_grad()
        if self.pic: self.cnn.zero_grad()

        if self.pic:
            state_batch.volatile = False
            q_batch = self.critic([state_batch, to_tensor(action_batch)])
        else:
            q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])

        # print(reward_batch, next_q_values*self.discount, target_q_batch, terminal_batch.astype(np.float))
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()
        if self.pic: self.cnn_optim.step()

        self.actor.zero_grad()
        if self.pic: self.cnn.zero_grad()

        if self.pic:
            state_batch.volatile = False
            policy_loss = -self.critic([
                state_batch,
                self.actor(state_batch)
            ])
        else:
            policy_loss = -self.critic([
                to_tensor(state_batch),
                self.actor(to_tensor(state_batch))
            ])

        policy_loss = policy_loss.mean()
        policy_loss.backward()

        if self.clip_actor_grad is not None:
            torch.nn.utils.clip_grad_norm_(self.actor.parameters(),
                                           float(self.clip_actor_grad))

            if self.writer is not None:
                mean_policy_grad = np.mean([
                    np.linalg.norm(p.grad.data.cpu().numpy().ravel())
                    for p in self.actor.parameters()
                ])
                self.writer.add_scalar('train/mean_policy_grad',
                                       mean_policy_grad, self.select_time)

        self.actor_optim.step()
        if self.pic: self.cnn_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)
        if self.pic:
            soft_update(self.cnn_target, self.cnn, self.tau)

        return -policy_loss, value_loss

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()
        if self.pic:
            self.cnn.eval()
            self.cnn_target.eval()

    def train(self):
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()
        if self.pic:
            self.cnn.train()
            self.cnn_target.train()

    def cuda(self):
        # the CNN only exists when the agent runs on image observations
        if self.pic:
            self.cnn.cuda()
            self.cnn_target.cuda()
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self, fix=False):
        action = np.random.uniform(-1.,1.,self.nb_actions)
        self.a_t = action
        if self.discrete and not fix:
            action = action.argmax()
#        if self.pic:
#            action = np.concatenate((softmax(action[:16]), softmax(action[16:])))
        return action
        
    def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0):
        self.eval()
        if self.pic:
            s_t = self.normalize(s_t)
            s_t = self.cnn(to_tensor(np.array([s_t])))
        if self.pic:
            action = to_numpy(
                self.actor_target(s_t)
            ).squeeze(0)
        else:
            action = to_numpy(
                self.actor(to_tensor(np.array([s_t])))
            ).squeeze(0)
        self.train()
        noise_level = noise_level * max(self.epsilon, 0)

        if np.random.uniform(0, 1) < noise_level:
            action = self.random_action(fix=True)  # epsilon greedy

        if decay_epsilon:
            self.epsilon -= self.depsilon
        self.a_t = action
        
        if return_fix:
            return action
        if self.discrete:
            return action.argmax()
        else:
            return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=1):        
        if output is None: return
        self.actor.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num))
        )
        self.actor_target.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num))
        )
        self.critic.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )
        self.critic_target.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )

    def save_model(self, output, num):
        if self.use_cuda:
            if self.pic:
                self.cnn.cpu()
            self.actor.cpu()
            self.critic.cpu()
        torch.save(
            self.actor.state_dict(),
            '{}/actor{}.pkl'.format(output, num)
        )
        torch.save(
            self.critic.state_dict(),
            '{}/critic{}.pkl'.format(output, num)
        )
        if self.use_cuda:
            if self.pic:
                self.cnn.cuda()
            self.actor.cuda()
            self.critic.cuda()
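# The helpers to_tensor / to_numpy / hard_update / soft_update used by the DDPG
# class above are not shown in this snippet. A minimal sketch of compatible
# helpers for a recent PyTorch; `volatile` is accepted only for call-site
# compatibility with the pre-0.4 Variable API the original code targets, and
# the exact dtypes are assumptions.
import numpy as np
import torch


def to_tensor(x, volatile=False, dtype=torch.float32):
    # `volatile` no longer exists in modern PyTorch; wrap the call site in
    # torch.no_grad() instead when gradients are not needed.
    return torch.as_tensor(np.asarray(x), dtype=dtype)


def to_numpy(x):
    return x.detach().cpu().numpy()


def hard_update(target, source):
    """Copy the source parameters into the target network."""
    for t, s in zip(target.parameters(), source.parameters()):
        t.data.copy_(s.data)


def soft_update(target, source, tau):
    """theta_target <- tau * theta_source + (1 - tau) * theta_target."""
    for t, s in zip(target.parameters(), source.parameters()):
        t.data.copy_(tau * s.data + (1.0 - tau) * t.data)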
Example #27
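# main() below relies on a get_action helper that samples an action from the
# diagonal Gaussian policy head (mu, std) produced by the Actor. A minimal
# sketch of such a helper; the name sample_gaussian_action and the numpy
# return format are assumptions, not the original implementation.
import torch


def sample_gaussian_action(mu, std):
    """Draw one action per state from N(mu, std) and return it as numpy."""
    action = torch.normal(mu, std)
    return action.detach().numpy()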
def main():
    env = gym.make(args.env_name)
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    running_state = ZFilter((num_inputs,), clip=5)

    print('state size:', num_inputs) 
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate, 
                              weight_decay=args.l2_rate)

    writer = SummaryWriter(comment="-ppo_iter-" + str(args.max_iter_num))
    
    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

    
    episodes = 0    

    for iter in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []

        while steps < args.total_sample_size: 
            state = env.reset()
            score = 0

            state = running_state(state)
            
            for _ in range(10000): 
                if args.render:
                    env.render()

                steps += 1

                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                action = get_action(mu, std)[0]
                next_state, reward, done, _ = env.step(action)

                if done:
                    mask = 0
                else:
                    mask = 1

                memory.append([state, action, reward, mask])

                next_state = running_state(next_state)
                state = next_state

                score += reward

                if done:
                    break
            
            episodes += 1
            scores.append(score)
        
        score_avg = np.mean(scores)
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)

        actor.train(), critic.train()
        train_model(actor, critic, memory, actor_optim, critic_optim, args)

        if iter % 100 == 0:
            score_avg = int(score_avg)

            model_path = os.path.join(os.getcwd(),'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(model_path, 'ckpt_'+ str(score_avg)+'.pth.tar')

            save_checkpoint({
                'actor': actor.state_dict(),
                'critic': critic.state_dict(),
                'z_filter_n':running_state.rs.n,
                'z_filter_m': running_state.rs.mean,
                'z_filter_s': running_state.rs.sum_square,
                'args': args,
                'score': score_avg
            }, filename=ckpt_path)