Example No. 1
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, agent_count, state_size, action_size, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            agent_count (int): number of agents (the critic takes all agents' states and actions)
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(agent_count * state_size,
                                   agent_count * action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(agent_count * state_size,
                                    agent_count * action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

    def soft_update(self):
        self.soft_update_network(self.critic_local, self.critic_target, TAU)
        self.soft_update_network(self.actor_local, self.actor_target, TAU)

    def soft_update_network(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def save(self, name):
        torch.save(self.actor_local.state_dict(), name + '_actor.pth')
        torch.save(self.critic_local.state_dict(), name + '_critic.pth')

    def load(self, name):
        self.actor_local.load_state_dict(torch.load(name + '_actor.pth'))
        self.critic_local.load_state_dict(torch.load(name + '_critic.pth'))
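
The class above relies on module-level names that are not shown in this listing (device, LR_ACTOR, LR_CRITIC, WEIGHT_DECAY, and TAU). A minimal sketch of a compatible setup; the values are assumptions, not the original configuration:

import torch

# Hypothetical hyperparameters; the original module defines its own values.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
LR_ACTOR = 1e-4       # actor learning rate
LR_CRITIC = 1e-3      # critic learning rate
WEIGHT_DECAY = 0.0    # L2 penalty for the critic optimizer
TAU = 1e-3            # soft-update interpolation factor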
Example No. 2
def train_tsp(args):

    # Goals from paper:
    # TSP20, 3.97
    # TSP50, 6.08
    # TSP100, 8.44

    from tasks import tsp
    from tasks.tsp import TSPDataset

    STATIC_SIZE = 2 # (x, y)
    DYNAMIC_SIZE = 1 # dummy for compatibility

    train_data = TSPDataset(args.num_nodes, args.train_size, args.seed)
    valid_data = TSPDataset(args.num_nodes, args.valid_size, args.seed + 1)

    update_fn = None

    actor = Actor(STATIC_SIZE,
                    DYNAMIC_SIZE,
                    args.hidden_size,
                    update_fn,
                    tsp.update_mask,
                    args.num_layers,
                    args.dropout).to(device)

    critic = Critic(STATIC_SIZE, DYNAMIC_SIZE, args.hidden_size).to(device)

    kwargs = vars(args)
    kwargs['train_data'] = train_data
    kwargs['valid_data'] = valid_data
    kwargs['reward_fn'] = tsp.reward
    kwargs['render_fn'] = tsp.render

    if args.checkpoint:
        path = os.path.join(args.checkpoint, 'actor.pt')
        actor.load_state_dict(torch.load(path, device))

        path = os.path.join(args.checkpoint, 'critic.pt')
        critic.load_state_dict(torch.load(path, device))

    if not args.test:
        train(actor, critic, **kwargs)

    test_data = TSPDataset(args.num_nodes, args.train_size, args.seed + 2)

    test_dir = 'test'
    test_loader = DataLoader(test_data, args.batch_size, False, num_workers=0)
    out = validate(test_loader, actor, tsp.reward, tsp.render, test_dir, num_plot=5)

    print('Average tour length: ', out)
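
train_tsp reads its configuration from an args namespace. A hedged sketch of an argparse setup covering just the attributes used in this snippet (the real script defines more options; the defaults below are illustrative only):

import argparse

parser = argparse.ArgumentParser()
# Attribute names mirror the lookups inside train_tsp; defaults are assumptions.
parser.add_argument('--num_nodes', type=int, default=20)
parser.add_argument('--train_size', type=int, default=100000)
parser.add_argument('--valid_size', type=int, default=1000)
parser.add_argument('--seed', type=int, default=12345)
parser.add_argument('--hidden_size', type=int, default=128)
parser.add_argument('--num_layers', type=int, default=1)
parser.add_argument('--dropout', type=float, default=0.1)
parser.add_argument('--batch_size', type=int, default=256)
parser.add_argument('--checkpoint', default=None)
parser.add_argument('--test', action='store_true')
args = parser.parse_args()

train_tsp(args)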
Example No. 3
def init_model(env, model_args, ckpt=None):
    # get input/output size and range
    s_dim = env.get_state_size()
    a_dim = env.get_action_size()
    a_min = env.a_min
    a_max = env.a_max
    a_noise = model_args["noise"] * np.ones(a_dim)

    # get reference memory for FFC
    ref_mem = env._mocap.get_ref_mem()
    if not model_args["with_ffc"]:
        ref_mem.fill(0)
    ref_mem = ref_mem[:, 1:]  # no phase velocity

    # automatically use gpu
    if use_gpu:
        torch.set_default_tensor_type('torch.cuda.FloatTensor')

    from model import Normalizer, Actor, Critic
    GAMMA = file_args["train_args"]["gamma"]
    non_norm = [0]  #FMD0
    s_norm = Normalizer(s_dim, non_norm)
    actor = Actor(s_dim, a_dim, a_min, a_max, a_noise, ref_mem.shape[0])
    critic = Critic(s_dim, 0, 1 / (1 - GAMMA))

    actor.set_reference(ref_mem)
    actor.ref_mem.requires_grad = False

    if args.ckpt is not None:
        try:
            checkpoint = torch.load(args.ckpt)
            actor.load_state_dict(checkpoint["actor"])
            critic.load_state_dict(checkpoint["critic"])
            s_norm.load_state_dict(checkpoint["s_norm"])
            print("loaded checkpoint from %s" % args.ckpt)
        except Exception:
            print("failed to load checkpoint from %s" % args.ckpt)
            raise

    return s_norm, actor, critic
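
init_model also depends on a global args namespace and a file_args dictionary defined elsewhere. A hedged usage sketch with assumed keys and values (only the fields the function actually reads):

# Assumed configuration; key names follow the lookups in init_model.
file_args = {"train_args": {"gamma": 0.95}}
model_args = {"noise": 0.1, "with_ffc": True}

# env and args come from the surrounding training script.
s_norm, actor, critic = init_model(env, model_args, ckpt=args.ckpt)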
Example No. 4
def main():
    expert_demo = pickle.load(open('./Ree1_expert.p', "rb"))
    # Ree1 : action 1
    # Ree2 : action 100
    # Ree3 : action 50
    # Ree4 : action 10
    # Ree5 : action 4
    # Ree6 : action 0.5

    # print('expert_demo_shape : ', np.array(expert_demo).shape)
    expert_x = int(expert_demo[1][0])
    expert_y = int(expert_demo[1][1])
    env = Env(expert_x, expert_y)
    # env = Env(0,0)

    # env.seed(args.seed)
    torch.manual_seed(args.seed)

    num_inputs = 2
    num_actions = 8
    running_state = ZFilter((num_inputs, ), clip=5)

    print('state size:', num_inputs)
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)
    discrim = Discriminator(num_inputs + num_actions, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(),
                              lr=args.learning_rate,
                              weight_decay=args.l2_rate)
    discrim_optim = optim.Adam(discrim.parameters(), lr=args.learning_rate)

    # load demonstrations
    # expert_demo, _ = pickle.load(open('./expert_demo/expert_demo.p', "rb"))

    demonstrations = np.array(expert_demo[0])

    # print("demonstrations.shape", demonstrations.shape)

    writer = SummaryWriter(args.logdir)

    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model',
                                       str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])
        discrim.load_state_dict(ckpt['discrim'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

    episodes = 0
    train_discrim_flag = True
    temp_learner = []  # discriminator accuracy history (learner)
    temp_expert = []   # discriminator accuracy history (expert)

    for iter in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []

        while steps < args.total_sample_size:
            state = env.reset()
            score = 0

            state = running_state(state)

            for _ in range(1000):
                if args.render:
                    env.render()

                steps += 1

                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                action = get_action(mu, std)[0]
                action2 = np.argmax(action)  # discrete index actually sent to the env
                next_state, reward, done, _ = env.step(action2)
                # next_state, reward, done, _ = env.step(action)
                irl_reward = get_reward(discrim, state, action)

                if done:
                    mask = 0
                else:
                    mask = 1

                memory.append([state, action, irl_reward, mask])

                next_state = running_state(next_state)
                state = next_state

                score += reward

                if done:
                    break

            episodes += 1
            scores.append(score)

        score_avg = np.mean(scores)
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes,
                                                       score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)

        actor.train(), critic.train(), discrim.train()
        if train_discrim_flag:
            expert_acc, learner_acc = train_discrim(discrim, memory,
                                                    discrim_optim,
                                                    demonstrations, args)
            print("Expert: %.2f%% | Learner: %.2f%%" %
                  (expert_acc * 100, learner_acc * 100))

            temp_learner.append(learner_acc * 100)
            temp_expert.append(expert_acc * 100)

            if ((expert_acc > args.suspend_accu_exp
                 and learner_acc > args.suspend_accu_gen and iter % 55 == 0)
                    or iter % 50 == 0):
                # train_discrim_flag = False
                plt.plot(temp_learner, label='learner')
                plt.plot(temp_expert, label='expert')
                plt.xlabel('Episode')
                plt.ylabel('Accuracy')
                plt.xticks([])
                plt.legend()
                plt.savefig('accuracy{}.png'.format(iter))
                # plt.show()

                model_path = 'C:/Users/USER/9 GAIL/lets-do-irl/mujoco/gail'
                ckpt_path = os.path.join(model_path,
                                         'ckpt_' + str(score_avg) + '.pth.tar')

                print("check path", ckpt_path)
                save_checkpoint(
                    {
                        'actor': actor.state_dict(),
                        'critic': critic.state_dict(),
                        'discrim': discrim.state_dict(),
                        'z_filter_n': running_state.rs.n,
                        'z_filter_m': running_state.rs.mean,
                        'z_filter_s': running_state.rs.sum_square,
                        'args': args,
                        'score': score_avg
                    },
                    filename=ckpt_path)

        train_actor_critic(actor, critic, memory, actor_optim, critic_optim,
                           args)

        if iter % 100 == 0:
            score_avg = int(score_avg)

            model_path = os.path.join(os.getcwd(), 'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            model_path = 'C:/Users/USER/9 GAIL/lets-do-irl/mujoco/gail'
            ckpt_path = os.path.join(model_path,
                                     'ckpt_' + str(score_avg) + '.pth.tar')

            save_checkpoint(
                {
                    'actor': actor.state_dict(),
                    'critic': critic.state_dict(),
                    'discrim': discrim.state_dict(),
                    'z_filter_n': running_state.rs.n,
                    'z_filter_m': running_state.rs.mean,
                    'z_filter_s': running_state.rs.sum_square,
                    'args': args,
                    'score': score_avg
                },
                filename=ckpt_path)
    plt.plot(temp_learner)
    plt.plot(temp_expert)
    plt.xlabel('Episode')
    plt.ylabel('Accuracy')
    plt.xticks([])
    plt.savefig('accuracy.png')
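
save_checkpoint is not shown in this listing; in scripts of this kind it is usually a thin wrapper around torch.save. A minimal sketch under that assumption:

import torch

def save_checkpoint(state, filename='checkpoint.pth.tar'):
    # Assumed helper: persist the checkpoint dictionary to disk.
    torch.save(state, filename)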
Example No. 5
File: ddpg.py  Project: YuanyeMa/RL
class DDPGAgent:
    def __init__(self,
                 plot=True,
                 seed=1,
                 env: gym.Env = None,
                 batch_size=128,
                 learning_rate_actor=0.001,
                 learning_rate_critic=0.001,
                 weight_decay=0.01,
                 gamma=0.999):

        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)

        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.batch_size = batch_size
        self.learning_rate_actor = learning_rate_actor
        self.learning_rate_critic = learning_rate_critic
        self.weight_decay = weight_decay
        self.gamma = gamma
        self.tau = 0.001

        self._to_tensor = util.to_tensor
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')

        self.actor = Actor(self.state_dim, self.action_dim).to(self.device)
        self.target_actor = Actor(self.state_dim,
                                  self.action_dim).to(self.device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                self.learning_rate_actor,
                                                weight_decay=self.weight_decay)

        self.critic = Critic(self.state_dim, self.action_dim).to(self.device)
        self.target_critic = Critic(self.state_dim,
                                    self.action_dim).to(self.device)
        self.critic_optimizer = torch.optim.Adam(
            self.critic.parameters(),
            self.learning_rate_critic,
            weight_decay=self.weight_decay)

        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)
        self.t = 0

    def _learn_from_memory(self, memory):
        '''Learn from replay memory and update the parameters of both networks.'''
        # Randomly sample a batch of transitions from memory
        trans_pieces = memory.sample(self.batch_size)
        s0 = np.vstack([x.state for x in trans_pieces])
        a0 = np.vstack([x.action for x in trans_pieces])
        r1 = np.vstack([x.reward for x in trans_pieces])
        s1 = np.vstack([x.next_state for x in trans_pieces])
        terminal_batch = np.vstack([x.is_done for x in trans_pieces])

        # Optimize the critic network parameters
        s1 = self._to_tensor(s1, device=self.device)
        s0 = self._to_tensor(s0, device=self.device)

        next_q_values = self.target_critic.forward(
            state=s1, action=self.target_actor.forward(s1)).detach()
        target_q_batch = self._to_tensor(r1, device=self.device) + \
            self.gamma * self._to_tensor(1.0 - terminal_batch.astype(np.float32), device=self.device) * next_q_values
        q_batch = self.critic.forward(s0,
                                      self._to_tensor(a0, device=self.device))

        # Compute the critic loss and update the critic network parameters
        loss_critic = F.mse_loss(q_batch, target_q_batch)
        #self.critic_optimizer.zero_grad()
        self.critic.zero_grad()
        loss_critic.backward()
        self.critic_optimizer.step()

        # Policy objective: maximize the critic's value estimate of the actor's action
        loss_actor = -self.critic.forward(s0, self.actor.forward(s0))  # gradient ascent on Q
        loss_actor = loss_actor.mean()
        self.actor.zero_grad()
        #self.actor_optimizer.zero_grad()
        loss_actor.backward()
        self.actor_optimizer.step()

        # Soft-update the target network parameters
        soft_update(self.target_actor, self.actor, self.tau)
        soft_update(self.target_critic, self.critic, self.tau)
        return (loss_critic.item(), loss_actor.item())

    def learning(self, memory):
        self.actor.train()
        return self._learn_from_memory(memory)

    def save_models(self, episode_count):
        torch.save(self.target_actor.state_dict(),
                   './Models/' + str(episode_count) + '_actor.pt')
        torch.save(self.target_critic.state_dict(),
                   './Models/' + str(episode_count) + '_critic.pt')

    def load_models(self, episode):
        self.actor.load_state_dict(
            torch.load('./Models/' + str(episode) + '_actor.pt'))
        self.critic.load_state_dict(
            torch.load('./Models/' + str(episode) + '_critic.pt'))
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)
        print('Models loaded successfully')
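
hard_update and soft_update are imported from a utility module that is not part of this listing. A sketch of the usual implementations, matching the call signatures used above (target network first, source network second):

def hard_update(target, source):
    # Copy the source parameters into the target network verbatim.
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)

def soft_update(target, source, tau):
    # theta_target = tau * theta_source + (1 - tau) * theta_target
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)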
Example No. 6
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 seed,
                 fc1=400,
                 fc2=300,
                 update_times=10):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents acting in parallel
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.num_agents = num_agents
        self.update_times = update_times

        self.noise = []
        for i in range(num_agents):
            self.noise.append(
                rm.OrnsteinUhlenbeckProcess(size=(action_size, ),
                                            std=LinearSchedule(0.2)))

        # critic local and target network (Q-Learning)
        self.critic_local = Critic(state_size, action_size, fc1, fc2,
                                   seed).to(device)

        self.critic_target = Critic(state_size, action_size, fc1, fc2,
                                    seed).to(device)
        self.critic_target.load_state_dict(self.critic_local.state_dict())

        # actor local and target network (Policy gradient)
        self.actor_local = Actor(state_size, action_size, fc1, fc2,
                                 seed).to(device)
        self.actor_target = Actor(state_size, action_size, fc1, fc2,
                                  seed).to(device)
        self.actor_target.load_state_dict(self.actor_local.state_dict())

        # optimizer for critic and actor network
        self.optimizer_critic = optim.Adam(self.critic_local.parameters(),
                                           lr=CRITIC_LR)
        self.optimizer_actor = optim.Adam(self.actor_local.parameters(),
                                          lr=ACTOR_LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        self.a_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        for i in range(self.num_agents):
            self.memory.add(state[i], action[i], reward[i], next_state[i],
                            done[i])

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY

        if self.t_step == 0:

            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                for i in range(self.update_times):
                    experiences = self.memory.sample()
                    self.learn(experiences, GAMMA)

    def act(self, state, training=True):
        """Returns continuous action values for all actions for the given state as per the current policy.
        
        Params
        ======
            state (array_like): current state
        """

        state = torch.from_numpy(state).float().detach().to(device)
        #print(state.shape,"act")

        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(state)
        self.actor_local.train()

        noise = []
        for i in range(self.num_agents):
            noise.append(self.noise[i].sample())

        return np.clip(actions.cpu().data.numpy() + np.array(noise), -1, 1)

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """

        states, actions, rewards, next_states, dones = experiences

        next_actions = self.actor_target(next_states)
        with torch.no_grad():
            Q_target_next = self.critic_target(next_states, next_actions)
        Q_targets = rewards + (gamma * Q_target_next * (1 - dones))

        Q_expected = self.critic_local(states, actions)

        #critic loss
        loss = F.mse_loss(Q_expected, Q_targets.detach())

        self.optimizer_critic.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.optimizer_critic.step()

        #actor loss

        action_pr = self.actor_local(states)
        p_loss = -self.critic_local(states, action_pr).mean()

        self.optimizer_actor.zero_grad()
        p_loss.backward()

        self.optimizer_actor.step()

        # ------------------- update target network ------------------- #

        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def reset_random(self):
        for i in range(self.num_agents):
            self.noise[i].reset_states()
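
The ReplayBuffer used by this agent (and by later examples) is not shown; it only needs add, sample, and __len__. A hedged sketch of a compatible buffer, assuming the usual (state, action, reward, next_state, done) layout and the same module-level device convention:

import random
from collections import deque, namedtuple

import numpy as np
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class ReplayBuffer:
    """Fixed-size experience buffer; a sketch of the add/sample/__len__ interface used above."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size  # kept only for signature compatibility
        self.memory = deque(maxlen=int(buffer_size))
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience", ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        # Return batched tensors in the order learn() unpacks them.
        batch = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in batch])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in batch])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in batch])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in batch])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in batch]).astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)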
Example No. 7
class Agent():
    """ Interacts with and learns from the environment """
    def __init__(self, state_size, action_size, num_agents, seed):
        """
        Initialize an Agent object

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents to run
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(seed)

        # Actor network (with target network)
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic network (with target network)
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise((num_agents, action_size), seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        self.steps_counter = 0
        self.train_counter = 0

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        for i in range(self.num_agents):
            self.memory.add(state[i, :], action[i, :], reward[i],
                            next_state[i, :], done[i])

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def add(self, state, action, reward, next_state, done):
        """ Save experience to replay memory """
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

    def learn_from_buffer(self, train_counter):
        for i in range(train_counter):
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, states, add_noise=True):
        """ Returns actions for given state as per current policy """
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            for agent_num, state in enumerate(states):
                action = self.actor_local(state).cpu().data.numpy()
                actions[agent_num, :] = action
        self.actor_local.train()
        if add_noise:
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """ 
        Update policy/value parameters using batch of experience tuples

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s,a,r,s',done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        ############# Update Critic #############
        # Get predicted next-state actions and Q-values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)

        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss (clip gradients after backprop, before the optimizer step)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        ############# Update Actor #############
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        ############# Update Target Networks #############
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """
        Soft update model parameters
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (pyTorch model): where weights come from
            target_model (pyTorch model): where weights will go
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def set_target_model(self, actor, critic):
        self.actor_target = actor
        self.critic_target = critic

    def get_target_model(self):
        return self.actor_target, self.critic_target

    def load_weights(self, actor_path, critic_path):
        # Actor
        self.actor_local.load_state_dict(torch.load(actor_path))
        self.actor_target.load_state_dict(torch.load(actor_path))

        # Critic
        self.critic_local.load_state_dict(torch.load(critic_path))
        self.critic_target.load_state_dict(torch.load(critic_path))
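
OUNoise is constructed with a shape and a seed and exposes reset() and sample(). A sketch of a standard Ornstein-Uhlenbeck process with that interface; the mu/theta/sigma values are assumptions:

import copy
import random

import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck noise process; sketch matching the reset()/sample() calls above."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)
        self.reset()

    def reset(self):
        # Reset the internal state back to the mean.
        self.state = copy.copy(self.mu)

    def sample(self):
        # Mean-reverting drift plus Gaussian noise.
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = x + dx
        return self.state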
Example No. 8
class DDPG(object):
    def __init__(self, nb_status, nb_actions, args):
        self.num_actor = 3

        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.discrete = args.discrete

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'use_bn': args.bn
        }
        self.actors = [Actor(self.nb_status, self.nb_actions) for _ in range(self.num_actor)]
        self.actor_targets = [Actor(self.nb_status, self.nb_actions) for _ in
                              range(self.num_actor)]
        self.actor_optims = [Adam(self.actors[i].parameters(), lr=args.prate) for i in range(self.num_actor)]

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        for i in range(self.num_actor):
            hard_update(self.actor_targets[i], self.actors[i])  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = rpm(args.rmsize)  # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        # 
        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.use_cuda = args.cuda
        # 
        if self.use_cuda: self.cuda()

    def update_policy(self, train_actor=True):
        # Sample batch
        state_batch, action_batch, reward_batch, \
        next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # Prepare for the target q batch
        next_q_values = 0
        for i in range(self.num_actor):
            next_q_values = next_q_values + self.critic_target([
                to_tensor(next_state_batch, volatile=True),
                self.actor_targets[i](to_tensor(next_state_batch, volatile=True)),
            ])
        # print('batch of picture is ok')
        next_q_values = next_q_values / self.num_actor
        next_q_values = next_q_values.detach()

        target_q_batch = to_tensor(reward_batch) + \
                         self.discount * to_tensor(1 - terminal_batch.astype(np.float32)) * next_q_values

        # Critic update
        self.critic.zero_grad()
        q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])

        # print(reward_batch, next_q_values*self.discount, target_q_batch, terminal_batch.astype(np.float))
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        sum_policy_loss = 0
        for i in range(self.num_actor):
            self.actors[i].zero_grad()

            policy_loss = -self.critic([
                to_tensor(state_batch),
                self.actors[i](to_tensor(state_batch))
            ])

            policy_loss = policy_loss.mean()
            policy_loss.backward()
            if train_actor:
                self.actor_optims[i].step()
            sum_policy_loss += policy_loss

            # Target update
            soft_update(self.actor_targets[i], self.actors[i], self.tau)

        soft_update(self.critic_target, self.critic, self.tau)

        return -sum_policy_loss / self.num_actor, value_loss

    def cuda(self):
        for i in range(self.num_actor):
            self.actors[i].cuda()
            self.actor_targets[i].cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        if self.discrete:
            return action.argmax()
        else:
            return action

    def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0):
        actions = []
        status = []
        tot_score = []
        for i in range(self.num_actor):
            action = to_numpy(self.actors[i](to_tensor(np.array([s_t]), volatile=True))).squeeze(0)
            noise_level = noise_level * max(self.epsilon, 0)
            action = action + self.random_process.sample() * noise_level
            status.append(s_t)
            actions.append(action)
            tot_score.append(0.)

        scores = self.critic([to_tensor(np.array(status), volatile=True), to_tensor(np.array(actions), volatile=True)])
        for j in range(self.num_actor):
            tot_score[j] += scores.data[j][0]
        best = np.array(tot_score).argmax()

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = actions[best]
        return actions[best]

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=0):
        if output is None: return
        for i in range(self.num_actor):
            actor = self.actors[i]
            actor_target = self.actor_targets[i]
            actor.load_state_dict(
                torch.load('{}/actor{}_{}.pkl'.format(output, num, i))
            )
            actor_target.load_state_dict(
                torch.load('{}/actor{}_{}.pkl'.format(output, num, i))
            )
        self.critic.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )
        self.critic_target.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )

    def save_model(self, output, num):
        if self.use_cuda:
            for i in range(self.num_actor):
                self.actors[i].cpu()
            self.critic.cpu()
        for i in range(self.num_actor):
            torch.save(
                self.actors[i].state_dict(),
                '{}/actor{}_{}.pkl'.format(output, num, i)
            )
        torch.save(
            self.critic.state_dict(),
            '{}/critic{}.pkl'.format(output, num)
        )
        if self.use_cuda:
            for i in range(self.num_actor):
                self.actors[i].cuda()
            self.critic.cuda()
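
to_tensor and to_numpy are project helpers (the volatile flag suggests they predate PyTorch 0.4). A hedged sketch of modern equivalents with the same call shape; the volatile argument is accepted only for compatibility and ignored:

import numpy as np
import torch

USE_CUDA = torch.cuda.is_available()

def to_tensor(ndarray, volatile=False, requires_grad=False, dtype=torch.float32):
    # volatile is ignored; gradient tracking is controlled via requires_grad.
    t = torch.as_tensor(np.asarray(ndarray), dtype=dtype)
    if USE_CUDA:
        t = t.cuda()
    t.requires_grad_(requires_grad)
    return t

def to_numpy(t):
    # Detach from the graph and move to host memory before converting.
    return t.detach().cpu().numpy()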
Example No. 9
class Agent():
    "Single agent with no learning algorithm (act, save, and load only)"

    def __init__(self,
                 state_size,
                 action_size,
                 random_seed,
                 lr_actor=1e-4,
                 lr_critic=1e-3,
                 weight_decay=0):
        """Initialize an Agent object

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            lr_actor (float) : learning rate actor network
            lr_critic (float) : learning rate critic network
            weight_decay (float) : weight decay regularizer
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.noise = OUNoise(action_size, random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=weight_decay)

    def act(self, state, add_noise=True):
        "Returns actions for given state as per current policy"
        if not isinstance(state, torch.Tensor):
            state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def load(self, filename, map_location=None):
        "Load weights for actor and critic"
        weights = torch.load(filename, map_location=map_location)
        self.actor_local.load_state_dict(weights['actor'])
        if 'critic' in weights:
            self.critic_local.load_state_dict(weights['critic'])

    def reset(self):
        self.noise.reset()

    def save(self, filename='checkpoint.pth'):
        "Serialize actor and critic weights"
        checkpoint = {
            'actor': self.actor_local.state_dict(),
            'critic': self.critic_local.state_dict()
        }
        torch.save(checkpoint, filename)
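
A short usage sketch of this inference-only agent, assuming a Gym-style environment whose observation and action sizes match the saved weights (the environment name and checkpoint file are placeholders):

import gym

env = gym.make("Pendulum-v1")               # placeholder environment
agent = Agent(state_size=env.observation_space.shape[0],
              action_size=env.action_space.shape[0],
              random_seed=0)
agent.load("checkpoint.pth")                # placeholder checkpoint file

state = env.reset()
agent.reset()
for _ in range(200):
    action = agent.act(state, add_noise=False)
    state, reward, done, _ = env.step(action)
    if done:
        break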
Example No. 10
def main():
    env = gym.make(args.env_name)
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    running_state = ZFilter((num_inputs,), clip=5)

    print('state size:', num_inputs) 
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)
    discrim = Discriminator(num_inputs + num_actions, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate, 
                              weight_decay=args.l2_rate) 
    discrim_optim = optim.Adam(discrim.parameters(), lr=args.learning_rate)
    
    # load demonstrations
    expert_demo, _ = pickle.load(open('./expert_demo/expert_demo.p', "rb"))
    demonstrations = np.array(expert_demo)
    print("demonstrations.shape", demonstrations.shape)
    
    writer = SummaryWriter(args.logdir)

    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])
        discrim.load_state_dict(ckpt['discrim'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

    
    episodes = 0
    train_discrim_flag = True

    for iter in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []

        while steps < args.total_sample_size: 
            state = env.reset()
            score = 0

            state = running_state(state)
            
            for _ in range(10000): 
                if args.render:
                    env.render()

                steps += 1

                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                action = get_action(mu, std)[0]
                next_state, reward, done, _ = env.step(action)
                irl_reward = get_reward(discrim, state, action)

                if done:
                    mask = 0
                else:
                    mask = 1

                memory.append([state, action, irl_reward, mask])

                next_state = running_state(next_state)
                state = next_state

                score += reward

                if done:
                    break
            
            episodes += 1
            scores.append(score)
        
        score_avg = np.mean(scores)
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)

        actor.train(), critic.train(), discrim.train()
        if train_discrim_flag:
            expert_acc, learner_acc = train_discrim(discrim, memory, discrim_optim, demonstrations, args)
            print("Expert: %.2f%% | Learner: %.2f%%" % (expert_acc * 100, learner_acc * 100))
            if expert_acc > args.suspend_accu_exp and learner_acc > args.suspend_accu_gen:
                train_discrim_flag = False
        train_actor_critic(actor, critic, memory, actor_optim, critic_optim, args)

        if iter % 100 == 0:
            score_avg = int(score_avg)

            model_path = os.path.join(os.getcwd(),'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(model_path, 'ckpt_'+ str(score_avg)+'.pth.tar')

            save_checkpoint({
                'actor': actor.state_dict(),
                'critic': critic.state_dict(),
                'discrim': discrim.state_dict(),
                'z_filter_n':running_state.rs.n,
                'z_filter_m': running_state.rs.mean,
                'z_filter_s': running_state.rs.sum_square,
                'args': args,
                'score': score_avg
            }, filename=ckpt_path)
Example No. 11
class TD3(object):
    def __init__(self, env, writer=None):
        """
        Twin Delayed Deep Deterministic Policy Gradient Algorithm(TD3)
        """
        self.env = env
        self.writer = writer

        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        self.max_action = env.action_space.high[0]

        # Randomly initialize network parameter
        self.actor = Actor(state_dim, action_dim).to('cuda')
        self.critic = Critic(state_dim, action_dim).to('cuda')

        # Initialize target network parameter
        self.target_actor = Actor(state_dim, action_dim).to('cuda')
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic = Critic(state_dim, action_dim).to('cuda')
        self.target_critic.load_state_dict(self.critic.state_dict())

        # Replay memory
        self.memory = ReplayMemory(state_dim, action_dim)

        self.gamma = gamma
        self.tau = tau

        # network parameter optimizer
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=critic_lr,
                                           weight_decay=weight_decay)

    def get_action(self, state, initial_act=False):
        if initial_act:
            return self.env.action_space.sample()
        action = self.actor(torch.from_numpy(state).to('cuda', torch.float))
        action = action.detach().cpu().numpy()
        action = action + np.random.normal(0, 0.1, size=action.shape)
        return np.clip(action, -1, 1)

    def store_transition(self, state, action, state_, reward, done):
        self.memory.store_transition(state, action, state_, reward, done)

    def soft_update(self, target_net, net):
        """Target parameters soft update"""
        for target_param, param in zip(target_net.parameters(),
                                       net.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1 - self.tau) * target_param.data)

    def update(self, time_step, batch_size=64):
        states, actions, states_, rewards, terminals = self.memory.sample(
            batch_size)

        # Update Critic
        with torch.no_grad():
            noise = (torch.randn_like(actions) * policy_noise).clamp(
                -noise_clip, noise_clip)

            actions_ = (self.target_actor(states_) + noise).clamp(
                -self.max_action, self.max_action)

            target_q1, target_q2 = self.target_critic(states_, actions_)
            y = rewards.unsqueeze(1) + terminals.unsqueeze(
                1) * gamma * torch.min(target_q1, target_q2)
        q1, q2 = self.critic(states, actions)
        critic_loss = F.mse_loss(q1, y) + F.mse_loss(q2, y)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()
        if self.writer and time_step:
            self.writer.add_scalar("loss/critic", critic_loss.item(),
                                   time_step)

        # Delayed Policy Update
        if time_step % policy_freq == 0:
            # Update Actor
            actor_loss = -1 * self.critic.Q1(states, self.actor(states)).mean()
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()
            if self.writer:
                self.writer.add_scalar("loss/actor", actor_loss.item(),
                                       time_step)

            # target parameter soft update
            self.soft_update(self.target_actor,
                             self.actor)  # update target actor network
            self.soft_update(self.target_critic,
                             self.critic)  # update target critic network

    def save_model(self, path='models/'):
        torch.save(self.actor.state_dict(), path + 'actor')
        torch.save(self.critic.state_dict(), path + 'critic')
        torch.save(self.target_actor.state_dict(), path + 'target_actor')
        torch.save(self.target_critic.state_dict(), path + 'target_critic')

    def load_model(self, path='models/'):
        self.actor.load_state_dict(torch.load(path + 'actor'))
        self.critic.load_state_dict(torch.load(path + 'critic'))
        self.target_actor.load_state_dict(torch.load(path + 'target_actor'))
        self.target_critic.load_state_dict(torch.load(path + 'target_critic'))
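
A hedged sketch of the training loop this TD3 class implies, driven through get_action, store_transition, and update (warm-up length, episode count, and the environment are assumptions):

import gym

env = gym.make("Pendulum-v1")               # placeholder environment
agent = TD3(env)

total_steps, warmup_steps = 0, 1000         # assumed random warm-up before using the policy
for episode in range(100):
    state = env.reset()
    done = False
    while not done:
        action = agent.get_action(state, initial_act=total_steps < warmup_steps)
        next_state, reward, done, _ = env.step(action)
        agent.store_transition(state, action, next_state, reward, done)
        if total_steps >= warmup_steps:
            agent.update(total_steps)
        state = next_state
        total_steps += 1

agent.save_model()                          # assumes a models/ directory exists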
Example No. 12
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 memory,
                 random_seed=0,
                 buffer_size=1e5,
                 batch_size=128,
                 gamma=0.99,
                 tau=1e-3,
                 lr_actor=1e-4,
                 lr_critic=1e-3,
                 weight_decay=0):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            memory: replay buffer this agent samples from (may be shared)
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.weight_decay = weight_decay

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.lr_critic,
                                           weight_decay=self.weight_decay)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        # self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, random_seed)
        self.memory = memory

        # Iteration
        self.n_learn = 0
        self.acc_loss_actor = 0
        self.acc_loss_critic = 0

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.n_learn += 1
            self.learn(experiences, self.gamma)

    def act(self, state, add_noise=False):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def load_actor(self, model_path):
        checkpoint = torch.load(model_path)
        self.actor_local.load_state_dict(checkpoint)
        self.actor_target.load_state_dict(checkpoint)

    def load_critic(self, model_path):
        checkpoint = torch.load(model_path)
        self.critic_local.load_state_dict(checkpoint)
        self.critic_target.load_state_dict(checkpoint)

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        self.acc_loss_critic += critic_loss.cpu().data.numpy()

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        self.acc_loss_actor += actor_loss.cpu().data.numpy()
        """
        if self.n_learn % 10 == 0:
            print('\rIter {0}\tActor Loss: {1:.5f}\tCritic Loss: {2:.5f}\t=='.format(self.n_learn, self.acc_loss_actor, self.acc_loss_critic), end="\r")
            self.acc_loss_actor = 0
            self.acc_loss_critic = 0
        """

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
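
Because this agent takes its replay memory as a constructor argument, one buffer can be shared between several agents. A brief usage sketch, assuming a ReplayBuffer with the signature shown in the commented-out line above and illustrative sizes:

# Shared buffer passed to two agents; sizes are placeholders.
memory = ReplayBuffer(4, int(1e5), 128, 0)

agent_a = Agent(state_size=33, action_size=4, memory=memory)
agent_b = Agent(state_size=33, action_size=4, memory=memory)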
Example No. 13
class DDPG(object):
    def __init__(self, nb_status, nb_actions, args, writer):
        self.clip_actor_grad = args.clip_actor_grad
        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.writer = writer
        self.select_time = 0
        
        # Create Actor and Critic Network
        net_cfg = {
            'hidden1':args.hidden1, 
            'hidden2':args.hidden2, 
            'init_method':args.init_method
        }

        self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_optim  = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim  = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor) # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)
        
        #Create replay buffer
        self.memory = rpm(args.rmsize)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        # 
        self.epsilon = 1.0
        self.s_t = None # Most recent state
        self.a_t = None # Most recent action
        self.use_cuda = args.cuda
        # 
        if self.use_cuda: self.cuda()

    def update_policy(self, train_actor = True):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # Prepare for the target q batch
        next_q_values = self.critic_target([
            to_tensor(next_state_batch, volatile=True),
            self.actor_target(to_tensor(next_state_batch, volatile=True)),
        ])
        # print('batch of picture is ok')
        next_q_values = next_q_values.detach()

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor(1 - terminal_batch.astype(np.float32)) * next_q_values

        # Critic update
        self.critic.zero_grad()

        q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])

        # print(reward_batch, next_q_values*self.discount, target_q_batch, terminal_batch.astype(np.float))
        value_loss = nn.MSELoss()(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        self.actor.zero_grad()

        policy_loss = -self.critic([
            to_tensor(state_batch),
            self.actor(to_tensor(state_batch))
        ])

        policy_loss = policy_loss.mean()
        policy_loss.backward()

        if self.clip_actor_grad is not None:
            torch.nn.utils.clip_grad_norm_(self.actor.parameters(), float(self.clip_actor_grad))

            if self.writer is not None:
                mean_policy_grad = np.mean([np.linalg.norm(p.grad.data.cpu().numpy().ravel())
                                            for p in self.actor.parameters()])
                self.writer.add_scalar('train/mean_policy_grad', mean_policy_grad, self.select_time)

        if train_actor:
            self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return -policy_loss, value_loss

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def train(self):
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1.,1.,self.nb_actions)
        self.a_t = action
        return action

    def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0):
        self.eval()
        # print(s_t.shape)
        action = to_numpy(
            self.actor(to_tensor(np.array([s_t])))
        ).squeeze(0)
            
        self.train()
        noise_level = noise_level * max(self.epsilon, 0)
        
        action = action * (1 - noise_level) + (self.random_process.sample() * noise_level)
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=1):        
        if output is None: return
        self.actor.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num))
        )
        self.actor_target.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num))
        )
        self.critic.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )
        self.critic_target.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )

    def save_model(self, output, num):
        if self.use_cuda:
            self.actor.cpu()
            self.critic.cpu()
        torch.save(
            self.actor.state_dict(),
            '{}/actor{}.pkl'.format(output, num)
        )
        torch.save(
            self.critic.state_dict(),
            '{}/critic{}.pkl'.format(output, num)
        )
        if self.use_cuda:
            self.actor.cuda()
            self.critic.cuda()
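
# The DDPG class above relies on module-level hard_update / soft_update helpers
# that are not shown in this snippet. A minimal sketch, following the
# (target, source) argument order of the calls above (an assumption, not code
# from the original):
def hard_update(target, source):
    # Copy the source parameters into the target network verbatim.
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)


def soft_update(target, source, tau):
    # Polyak averaging: theta_target = tau * theta_source + (1 - tau) * theta_target
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * param.data +
                                (1.0 - tau) * target_param.data)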
Exemplo n.º 14
0
class DDPGTrainer(object):
    def __init__(self):
        self.actor = Actor().to(device)
        self.actor_target = Actor().to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=0.0001)

        self.critic = Critic().to(device)
        self.critic_target = Critic().to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 weight_decay=1e-2)

        self.loss = torch.nn.MSELoss()

    def train(self,
              replay_buffer,
              iterations,
              batch_size=64,
              discount=0.99,
              tau=0.001):

        for it in range(iterations):
            # Sample replay buffer
            smp = replay_buffer.sample(batch_size)
            x, y, u, r, d = smp
            state = torch.FloatTensor(x).to(device)
            action = torch.stack(u)
            next_state = torch.FloatTensor(y).to(device)
            done = torch.FloatTensor(d).to(device)
            reward = torch.FloatTensor(r).to(device)

            # Compute the target Q value
            ac = self.actor_target(next_state)
            ac = torch.cat(ac, dim=1)
            target_Q = self.critic_target(next_state, ac)
            # NOTE: `done` is used as a multiplicative mask here, so the buffer is expected
            # to store the *not-done* flag (1.0 while the episode continues), as in the
            # reference TD3 implementation.
            target_Q = reward + (done * discount * target_Q).detach()

            # Get current Q estimate
            current_Q = self.critic(state, action)

            # Compute critic loss
            critic_loss = self.loss(current_Q, target_Q)

            # Optimize the critic
            self.critic_optimizer.zero_grad()
            critic_loss.backward(retain_graph=True)
            self.critic_optimizer.step()

            # Compute actor loss
            actor_loss = -self.critic(state, torch.cat(self.actor(state),
                                                       dim=1)).mean()

            # Optimize the actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Update the frozen target models
            for param, target_param in zip(self.critic.parameters(),
                                           self.critic_target.parameters()):
                target_param.data.copy_(tau * param.data +
                                        (1 - tau) * target_param.data)

            for param, target_param in zip(self.actor.parameters(),
                                           self.actor_target.parameters()):
                target_param.data.copy_(tau * param.data +
                                        (1 - tau) * target_param.data)
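
# The train() loop above assumes a replay buffer whose sample(batch_size) call
# returns (states, next_states, actions, rewards, not_done), where `actions` is
# a sequence of torch tensors (it is torch.stack'ed) and `not_done` is the
# 1-while-not-terminal mask used directly in the target. A minimal deque-based
# sketch under those assumptions; the class and variable names are illustrative:
import random
from collections import deque

import numpy as np
import torch


class SimpleReplayBuffer:
    def __init__(self, capacity=int(1e6)):
        self.storage = deque(maxlen=capacity)

    def add(self, state, next_state, action, reward, not_done):
        # Actions are stored as torch tensors so that train() can torch.stack() them.
        self.storage.append((np.asarray(state, dtype=np.float32),
                             np.asarray(next_state, dtype=np.float32),
                             torch.as_tensor(action, dtype=torch.float32),
                             float(reward),
                             float(not_done)))

    def sample(self, batch_size):
        batch = random.sample(self.storage, batch_size)
        states, next_states, actions, rewards, not_dones = zip(*batch)
        return (np.stack(states),
                np.stack(next_states),
                list(actions),
                np.array(rewards, dtype=np.float32).reshape(-1, 1),
                np.array(not_dones, dtype=np.float32).reshape(-1, 1))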
Exemplo n.º 15
0
class Agent(object):
    def __init__(
        self,
        a_dim,
        s_dim,
        a_bound,
    ):
        self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound,
        #self.sess = tf.Session()
        self.P_online = Actor(s_dim, a_dim)
        self.P_target = Actor(s_dim, a_dim)
        self.P_target.load_state_dict(self.P_online.state_dict())
        self.Q_online = Critic(s_dim, a_dim)
        self.Q_target = Critic(s_dim, a_dim)
        self.Q_target.load_state_dict(self.Q_online.state_dict())
        self.q_optimizer = torch.optim.Adam(self.Q_online.parameters(),
                                            lr=LR_C)
        self.p_optimizer = torch.optim.Adam(self.P_online.parameters(),
                                            lr=LR_A)
        self.loss_td = nn.MSELoss()
        self.replay_buffer = ReplayBuffer()
        self.batch_size = 32

        self.discrete = False
        self.ep_step = 0
        # noise
        self.noise = Noise(DELTA, SIGMA, OU_A, OU_MU)
        # Initialize noise
        self.ou_level = 0.
        self.action_low = -2
        self.action_high = 2

    def act(self, state, test=False):
        if not test:
            with torch.no_grad():
                # boring type casting
                state = ((
                    torch.from_numpy(state)).unsqueeze(0)).float().to('cpu')
                action = self.P_online(state)  # continuous output
                a = action.data.cpu().numpy()
                if self.discrete:
                    action = np.argmax(a)
                    return a, action
                else:
                    if self.ep_step < 200:
                        self.ou_level = self.noise.ornstein_uhlenbeck_level(
                            self.ou_level)
                    action = np.clip(a + self.ou_level, self.action_low,
                                     self.action_high)
                    return (torch.from_numpy(action)).view(-1)
        else:
            # Test mode: act greedily, without exploration noise
            with torch.no_grad():
                state = ((
                    torch.from_numpy(state)).unsqueeze(0)).float().to('cpu')
                return self.P_online(state).view(-1)

    def collect_data(self, state, action, reward, next_state, done):
        self.replay_buffer.push(
            torch.from_numpy(state).float().unsqueeze(0),
            torch.from_numpy(action).float().unsqueeze(0),
            torch.tensor([reward]).float().unsqueeze(0),
            torch.from_numpy(next_state).float().unsqueeze(0),
            torch.tensor([done]).float().unsqueeze(0))

    def clear_data(self):
        raise NotImplementedError("Circular Queue don't need this function")

    def update(self):
        if len(self.replay_buffer) < self.batch_size:
            return
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(
            batch_size=self.batch_size, device='cpu')

        #===============================Critic Update===============================
        with torch.no_grad():
            target = rewards + GAMMA * (1 - dones) * self.Q_target(
                (next_states, self.P_target(next_states)))
        Q = self.Q_online((states, actions))
        td_error = self.loss_td(target, Q)
        self.q_optimizer.zero_grad()
        td_error.backward()
        self.q_optimizer.step()

        #===============================Actor Update===============================
        q = self.Q_online((states, self.P_online(states)))
        loss_a = -torch.mean(q)
        self.p_optimizer.zero_grad()
        loss_a.backward()
        self.p_optimizer.step()

        #===============================Target Update===============================
        soft_update(self.Q_target, self.Q_online, tau=1e-2)
        soft_update(self.P_target, self.P_online, tau=1e-2)
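
# Usage sketch for the Agent above, assuming a Gym-style environment with a
# continuous action space bounded by [-2, 2] (e.g. Pendulum) and the older
# Gym reset()/step() API; episode and step counts are illustrative only.
import gym

env = gym.make('Pendulum-v1')
agent = Agent(a_dim=env.action_space.shape[0],
              s_dim=env.observation_space.shape[0],
              a_bound=env.action_space.high)

for episode in range(100):
    state = env.reset()
    agent.ep_step = 0                  # act() only injects OU noise early in the episode
    for t in range(200):
        action = agent.act(state)      # torch tensor of shape (a_dim,)
        next_state, reward, done, _ = env.step(action.numpy())
        agent.collect_data(state, action.numpy(), reward, next_state, done)
        agent.update()                 # no-op until the buffer holds a full batch
        state = next_state
        agent.ep_step += 1
        if done:
            break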
Exemplo n.º 16
0
class DDPG():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, random_seed, num_agents,
                 agent_id):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.num_agents = num_agents
        self.agent_id = agent_id
        self.eps = EPS_START
        self.eps_decay = 1 / (EPS_EP_END * EPOCHS)
        self.timestep = 0

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size * 2, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size * 2, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size * 2, action_size * 2,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size * 2, action_size * 2,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise((num_agents, action_size), random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.timestep += 1
        priority = (abs(reward) + PRIORITY_EPS)**PRIORITY_ALPHA
        self.memory.add(state, action, reward, next_state, done, priority)

        if self.timestep % UPDATE_EVERY != 0:
            return

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            for i in range(EPOCHS):
                experiences = self.memory.sample(device)
                self.learn(experiences, GAMMA)

    def act(self, state, add_noise):
        """Returns actions for both agents as per current policy, given their respective states."""
        state = torch.from_numpy(state).float().to(device)
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            # get action for each agent and concatenate them
            actions = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        # add noise to actions
        if add_noise:
            actions += self.eps * self.noise.sample()
        actions = np.clip(actions, -1, 1)
        return actions

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        # Construct next actions vector relative to the agent
        if self.agent_id == 0:
            actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1)
        else:
            actions_next = torch.cat((actions[:, :2], actions_next), dim=1)
        # Compute Q targets for current states (y_i)
        Q_targets_next = self.critic_target(next_states, actions_next)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        # Construct action prediction vector relative to each agent
        if self.agent_id == 0:
            actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1)
        else:
            actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1)
        # Compute actor loss
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # update noise decay parameter
        self.eps -= self.eps_decay
        self.eps = max(self.eps, EPS_FINAL)
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def save(self):
        torch.save(self.actor_local.state_dict(),
                   'actor{}.pth'.format(self.agent_id))
        torch.save(self.critic_local.state_dict(),
                   'critic{}.pth'.format(self.agent_id))

    def load(self):
        self.actor_local.load_state_dict(
            torch.load('actor{}.pth'.format(self.agent_id)))
        self.critic_local.load_state_dict(
            torch.load('critic{}.pth'.format(self.agent_id)))
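
# The OUNoise class used above (and in the following snippets) is not included
# here. A commonly used Ornstein-Uhlenbeck implementation with the same
# reset()/sample() interface is sketched below; the parameter defaults are
# typical choices, not necessarily those of the original code.
import copy
import random

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.seed = random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean (mu)."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update the internal state and return it as a noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = x + dx
        return self.state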
Exemplo n.º 17
0
class DDPG_Agent:
    def __init__(self, state_size, action_size, seed, index=0, num_agents=2):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int):   Dimension of each state
            action_size (int):  Dimension of each action
            seed (int):         Random seed
            index (int):        Index assigned to the agent
            num_agents (int):   Number of agents in the environment
        """

        self.state_size = state_size  # State size
        self.action_size = action_size  # Action size
        self.seed = torch.manual_seed(seed)  # Random seed
        self.index = index  # Index of this agent, not used at the moment
        self.tau = TAU  # Parameter for soft weight update
        self.num_updates = N_UPDATES  # Number of updates to perform when updating
        self.num_agents = num_agents  # Number of agents in the environment
        self.tstep = 0  # Simulation step (modulo (%) UPDATE_EVERY)
        self.gamma = GAMMA  # Gamma for the reward discount
        self.alpha = ALPHA  # PER: toggle prioritization (0..1)

        # Set up actor and critic networks
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Ornstein-Uhlenbeck noise
        self.noise = OUNoise((1, action_size), seed)

        # Replay buffer
        self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE,
                                              BATCH_SIZE, seed, self.alpha)

    # act and act_targets similar to exercises and MADDPG Lab
    def act(self, states, noise=1.0):
        """Returns actions for the given states as per current policy.

        Params
        ======
            states [n_agents, state_size]: current states
            noise (float):    scale factor applied to the exploration noise (0 disables it)
        """
        # Convert the numpy state array to a torch tensor on the target device
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((1, self.action_size))

        # Put model into evaluation mode
        self.actor_local.eval()

        # Get actions for current state, transformed from probabilities
        with torch.no_grad():
            actions = self.actor_local(states).cpu().data.numpy()

        # Put actor back into training mode
        self.actor_local.train()

        # Ornstein-Uhlenbeck noise addition
        actions += noise * self.noise.sample()

        #  Transform probability into valid action ranges
        return np.clip(actions, -1, 1)

    def step(self, states, actions, rewards, next_states, dones, beta):
        """Save experience in replay memory, use random samples from buffer to learn.
        
        PARAMS
        ======
            states:     [n_agents, state_size]  current state
            actions:    [n_agents, action_size] taken action
            rewards:    [n_agents]              earned reward
            next_states:[n_agents, state_size]  next state
            dones:      [n_agents]              Whether episode has finished
            beta:       [0..1]                  PER: toggles correction for importance weights (0 - no corrections, 1 - full correction)
        """
        # ------------------------------------------------------------------
        # Save experience in replay memory - slightly more effort due to Prioritization
        # We need to calculate priorities for the experience tuple.
        # This is in our case (Q_expected - Q_target)**2
        # -----------------------------------------------------------------
        # Set all networks to evaluation mode
        self.actor_target.eval()
        self.critic_target.eval()
        self.critic_local.eval()

        state = torch.from_numpy(states).float().to(device)
        next_state = torch.from_numpy(next_states).float().to(device)
        action = torch.from_numpy(actions).float().to(device)
        #reward = torch.from_numpy(rewards).float().to(device)
        #done = torch.from_numpy(dones).float().to(device)

        with torch.no_grad():
            next_actions = self.actor_target(state)
            own_action = action[:, self.index *
                                self.action_size:(self.index + 1) *
                                self.action_size]
            if self.index:
                # Agent 1
                next_actions_agent = torch.cat((own_action, next_actions),
                                               dim=1)
            else:
                # Agent 0: flipped order
                next_actions_agent = torch.cat((next_actions, own_action),
                                               dim=1)

            # Predicted Q value from Critic target network
            Q_targets_next = self.critic_target(next_state,
                                                next_actions_agent).float()
            #print(f"Type Q_t_n: {type(Q_targets_next)}")
            #print(f"Type gamma: {type(self.gamma)}")
            #print(f"Type dones: {type(dones)}")
            Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
            Q_expected = self.critic_local(state, action)

        # Use error between Q_expected and Q_targets as priority in buffer
        error = (Q_expected - Q_targets)**2
        self.memory.add(state, action, rewards, next_state, dones, error)

        # Set all networks back to training mode
        self.actor_target.train()
        self.critic_target.train()
        self.critic_local.train()

        # ------------------------------------------------------------------
        # Usual learning procedure
        # -----------------------------------------------------------------
        # Learn every UPDATE_EVERY time steps
        self.tstep = (self.tstep + 1) % UPDATE_EVERY

        # If UPDATE_EVERY and enough samples are available in memory, get random subset and learn
        if self.tstep == 0 and len(self.memory) > BATCH_SIZE:
            for _ in range(self.num_updates):
                experiences = self.memory.sample(beta)
                self.learn(experiences)

    def reset(self):
        """Reset the noise parameter of the agent."""
        self.noise.reset()

    def learn(self, experiences):
        """Update value parameters using given batch of experience tuples. 
        Update according to 
            Q_targets = r + gamma * critic_target(next_state, actor_target(next_state))
        
        According to the lessons: 
            actor_target  (state)           gives   action
            critic_target (state, action)   gives   Q-value

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of 
                    states          states visited
                    actions         actions taken by all agents
                    rewards         rewards received
                    next states     all next states
                    dones           whether or not a final state is reached 
                    weights         weights of the experiences
                    indices         indices of the experiences            
        """

        # Load experiences from sample
        states, actions, rewards, next_states, dones, weights_cur, indices = experiences

        # ------------------- update critic ------------------- #

        # Get next actions via actor network
        next_actions = self.actor_target(next_states)

        # Stack action together with action of the agent
        own_actions = actions[:,
                              self.index * self.action_size:(self.index + 1) *
                              self.action_size]
        if self.index:
            # Agent 1
            next_actions_agent = torch.cat((own_actions, next_actions), dim=1)
        else:
            # Agent 0: flipped order
            next_actions_agent = torch.cat((next_actions, own_actions), dim=1)

        # Predicted Q value from Critic target network
        Q_targets_next = self.critic_target(next_states, next_actions_agent)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        Q_expected = self.critic_local(states, actions)

        # Update priorities in ReplayBuffer
        loss = (Q_expected - Q_targets).pow(2).reshape(
            weights_cur.shape) * weights_cur
        self.memory.update(indices, loss.data.cpu().numpy())

        # Compute critic loss
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # Clip gradients
        #torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), GRAD_CLIPPING)
        self.critic_optimizer.step()

        # ------------------- update actor ------------------- #
        actions_expected = self.actor_local(states)

        # Stack action together with action of the agent
        own_actions = actions[:,
                              self.index * self.action_size:(self.index + 1) *
                              self.action_size]
        if self.index:
            # Agent 1:
            actions_expected_agent = torch.cat((own_actions, actions_expected),
                                               dim=1)
        else:
            # Agent 0: flipped order
            actions_expected_agent = torch.cat((actions_expected, own_actions),
                                               dim=1)

        # Compute actor loss based on expectation from actions_expected
        actor_loss = -self.critic_local(states, actions_expected_agent).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update target networks
        self.target_soft_update(self.critic_local, self.critic_target)
        self.target_soft_update(self.actor_local, self.actor_target)

    def target_soft_update(self, local_model, target_model):
        """Soft update model parameters for actor and critic of all MADDPG agents.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        """

        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)

    def save(self, filename):
        """Saves the agent to the local workplace

        Params
        ======
            filename (string): where to save the weights
        """

        checkpoint = {
            'input_size':
            self.state_size,
            'output_size':
            self.action_size,
            'actor_hidden_layers': [
                each.out_features for each in self.actor_local.hidden_layers
                if each._get_name() != 'BatchNorm1d'
            ],
            'actor_state_dict':
            self.actor_local.state_dict(),
            'critic_hidden_layers': [
                each.out_features for each in self.critic_local.hidden_layers
                if each._get_name() != 'BatchNorm1d'
            ],
            'critic_state_dict':
            self.critic_local.state_dict()
        }

        torch.save(checkpoint, filename)

    def load_weights(self, filename):
        """ Load weights to update agent's actor and critic networks.
        Expected is a format like the one produced by self.save()

        Params
        ======
            filename (string): where to load data from. 
        """
        checkpoint = torch.load(filename)
        if not checkpoint['input_size'] == self.state_size:
            print(
                f"Error when loading weights from checkpoint {filename}: input size {checkpoint['input_size']} doesn't match state size of agent {self.state_size}"
            )
            return None
        if not checkpoint['output_size'] == self.action_size:
            print(
                f"Error when loading weights from checkpoint {filename}: output size {checkpoint['output_size']} doesn't match action space size of agent {self.action_size}"
            )
            return None
        my_actor_hidden_layers = [
            each.out_features for each in self.actor_local.hidden_layers
            if each._get_name() != 'BatchNorm1d'
        ]
        if not checkpoint['actor_hidden_layers'] == my_actor_hidden_layers:
            print(
                f"Error when loading weights from checkpoint {filename}: actor hidden layers {checkpoint['actor_hidden_layers']} don't match agent's actor hidden layers {my_actor_hidden_layers}"
            )
            return None
        my_critic_hidden_layers = [
            each.out_features for each in self.critic_local.hidden_layers
            if each._get_name() != 'BatchNorm1d'
        ]
        if not checkpoint['critic_hidden_layers'] == my_critic_hidden_layers:
            print(
                f"Error when loading weights from checkpoint {filename}: critic hidden layers {checkpoint['critic_hidden_layers']} don't match agent's critic hidden layers {my_critic_hidden_layers}"
            )
            return None
        self.actor_local.load_state_dict(checkpoint['actor_state_dict'])
        self.critic_local.load_state_dict(checkpoint['critic_state_dict'])
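
# The hyper-parameters referenced by DDPG_Agent above are defined elsewhere in
# the original project; the values below are typical illustrative choices, not
# taken from the source, together with one common way to anneal the PER
# importance-sampling exponent `beta` expected by step():
BUFFER_SIZE = int(1e5)      # replay buffer capacity
BATCH_SIZE = 128            # minibatch size
GAMMA = 0.99                # discount factor
TAU = 1e-3                  # soft-update interpolation
LR_ACTOR = 1e-4             # actor learning rate
LR_CRITIC = 1e-3            # critic learning rate
WEIGHT_DECAY = 0.0          # critic L2 regularization
UPDATE_EVERY = 2            # learn every N environment steps
N_UPDATES = 4               # gradient updates per learning step
ALPHA = 0.6                 # PER prioritization exponent (0 = uniform sampling)

BETA_START, BETA_FRAMES = 0.4, 100000   # hypothetical annealing schedule for beta


def beta_by_frame(frame_idx):
    # Linearly anneal beta from BETA_START to 1.0 over BETA_FRAMES steps.
    return min(1.0, BETA_START + frame_idx * (1.0 - BETA_START) / BETA_FRAMES)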
Exemplo n.º 18
0
class MiADDPG():
    """Multiple independent Agents trained with DDPG

    This class allows to shared experience-buffer and critic network of the
    class::Agent.
    """
    def __init__(self,
                 num_agents,
                 state_size,
                 action_size,
                 random_seed,
                 lr_actor=1e-4,
                 lr_critic=1e-3,
                 weight_decay=0,
                 tau=1e-3,
                 gamma=0.99,
                 batch_size=128,
                 buffer_size=int(1e5),
                 share_critic=True,
                 share_buffer=True):
        """Initialize an multi-agent wrapper

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            tau (float): control soft-update
            gamma (float): discount factor
            batch_size (int): size of training batch
            buffer_size (int) : cap on number of experiences
        """
        self.state_size = state_size
        self.action_size = action_size
        self.batch_size = batch_size
        self.tau = tau
        self.gamma = gamma

        self.agents = [
            Agent(state_size,
                  action_size,
                  random_seed,
                  lr_actor=lr_actor,
                  lr_critic=lr_critic,
                  weight_decay=weight_decay) for i in range(num_agents)
        ]
        self.share_critic = share_critic
        if share_critic:
            self.critic_local = Critic(state_size, action_size,
                                       random_seed).to(device)
            self.critic_target = Critic(state_size, action_size,
                                        random_seed).to(device)
            self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                               lr=lr_critic,
                                               weight_decay=weight_decay)
            for agent in self.agents:
                agent.critic_local = None
                agent.critic_target = None
                agent.critic_optimizer = None

        self.share_buffer = share_buffer
        num_buffer = num_agents
        if share_buffer:
            num_buffer = 2
        self.memory = [
            ReplayBuffer(buffer_size, batch_size) for i in range(num_buffer)
        ]

    def step(self, state, action, reward, next_state, done):
        "Save experience and random sample from buffer to learn"
        # Save experience / reward in replay memory
        for i in range(len(state)):
            ind = i
            if self.share_buffer:
                ind = 0
            self.memory[ind].add(state[i, ...], action[i, ...], reward[i],
                                 next_state[i, ...], done[i])

        # Learn, if enough samples are available in memory
        c_i = random.randint(0, len(self.agents) - 1)
        for i, agent in enumerate(self.agents):
            update_critic = True
            if self.share_critic and i != c_i:
                update_critic = False

            ind = i
            if self.share_buffer:
                ind = 0
            if len(self.memory[ind]) < self.batch_size:
                continue

            experiences = self.memory[ind].sample()
            self.learn(agent, experiences, self.gamma, update_critic)

    def act(self, state, add_noise=True):
        "Returns actions for given state as per current policy"
        state = torch.from_numpy(state).float().to(device)
        action_list = []
        for i, agent in enumerate(self.agents):
            action_list.append(agent.act(state[[i], ...]))
        return np.concatenate(action_list, axis=0)

    def load(self, filename, map_location=None):
        "Load weights for actor and critic"
        weights = torch.load(filename, map_location=map_location)
        for i, agent in enumerate(self.agents):
            agent.actor_local.load_state_dict(weights[f'actor_{i}'])
            if self.share_critic:
                self.critic_local.load_state_dict(weights['critic'])
                continue
            agent.critic_local.load_state_dict(weights[f'critic_{i}'])

    def reset(self):
        # Reset the exploration noise of each wrapped agent
        for agent in self.agents:
            agent.reset()

    def save(self, filename='checkpoint.pth'):
        "Serialize actor and critic weights"
        checkpoint = {}
        for i, agent in enumerate(self.agents):
            checkpoint[f'actor_{i}'] = agent.actor_local.state_dict()
            if not self.share_critic:
                checkpoint[f'critic_{i}'] = agent.critic_local.state_dict()
        if self.share_critic:
            checkpoint['critic'] = self.critic_local.state_dict()
        torch.save(checkpoint, filename)

    def learn(self, agent, experiences, gamma, update_critic=True):
        """Update policy and value parameters with a batch of experiences

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done)
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        critic_target = agent.critic_target
        critic_local = agent.critic_local
        critic_optimizer = agent.critic_optimizer
        if self.share_critic:
            critic_target = self.critic_target
            critic_local = self.critic_local
            critic_optimizer = self.critic_optimizer

        # Update critic
        # Get predicted next-state actions and Q values from target models
        actions_next = agent.actor_target(next_states)
        Q_targets_next = critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        if update_critic:
            critic_optimizer.zero_grad()
            critic_loss.backward()
            critic_optimizer.step()

        # Update actor
        # Compute actor loss
        actions_pred = agent.actor_local(states)
        actor_loss = -critic_local(states, actions_pred).mean()
        # Minimize the loss
        agent.actor_optimizer.zero_grad()
        actor_loss.backward()
        agent.actor_optimizer.step()

        # Update target networks
        soft_update(critic_local, critic_target, self.tau)
        soft_update(agent.actor_local, agent.actor_target, self.tau)
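
# MiADDPG above calls a module-level soft_update(local_model, target_model, tau)
# helper that is not shown here; note the (local, target) argument order at its
# call sites, which differs from some of the other snippets in this collection.
# A minimal sketch under that assumption:
def soft_update(local_model, target_model, tau):
    # theta_target = tau * theta_local + (1 - tau) * theta_target
    for target_param, local_param in zip(target_model.parameters(),
                                         local_model.parameters()):
        target_param.data.copy_(tau * local_param.data +
                                (1.0 - tau) * target_param.data)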
Exemplo n.º 19
0
class Academy:
    def __init__(self, state_size, action_size, random_seed, memory):
        # checked
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        self.checkpoint_file_path = './checkpoint_critic.pth'
        if os.path.isfile(self.checkpoint_file_path):
            self.critic_local.load_state_dict(
                torch.load(self.checkpoint_file_path))
            self.critic_target.load_state_dict(
                torch.load(self.checkpoint_file_path))

    def step(self, actor, memory):
        # unchecked

        # Learn, if enough samples are available in memory
        if len(memory) > BATCH_SIZE:
            experiences = memory.sample()
            self.learn(actor, experiences, GAMMA)

    def learn(self, actor, experiences, gamma):
        # checked
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = actor.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = actor.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        actor.actor_optimizer.zero_grad()
        actor_loss.backward()
        actor.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(actor.actor_local, actor.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        # checked
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class Multi_Agents():
    """
    Implements interactions and learning on environments for a set of agents
    """
    def __init__(self, agents_count, state_size, action_size, random_seed,
                 buffer_size, batch_size, gamma, fc1_units, fc2_units, noise,
                 lr_actor, lr_critic):
        """Initialize a Multi_Agent.

        Params
        ======
            agents_count (int): the number of agents
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            buffer_size(int): replay buffer size
            gamma(float): discount factor
            fc1_units (int): Number of nodes in first hidden layer
            fc2_units (int): Number of nodes in second hidden layer
            noise(Object): The noise applied to the actions selection
            lr_actor(float) : learning rates of the actor
            lr_critic(float) : learning rates of the critic
        """

        self.agents_count = agents_count
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.gamma = gamma
        self.batch_size = batch_size

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed,
                                 fc1_units, fc2_units).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed,
                                  fc1_units, fc2_units).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed,
                                   fc1_units, fc2_units).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed,
                                    fc1_units, fc2_units).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=WEIGHT_DECAY)

        # After reading the implementation by ShangtongZhang suggested in the course,
        # it seems relevant to initialize the weights of the target networks
        # with the same values as the local networks:
        self.actor_target.load_state_dict(self.actor_local.state_dict())
        self.critic_target.load_state_dict(self.critic_local.state_dict())

        # Noise process
        self.noise = noise

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size,
                                   random_seed)

    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        for a in range(self.agents_count):
            # save for each agent
            self.memory.add(states[a], actions[a], rewards[a], next_states[a],
                            dones[a])

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

    def act(self, states, add_noise=True):
        """Returns actions for each given state of each agent as per current policy."""

        states = torch.from_numpy(states).float().to(device)
        actions = np.empty([self.agents_count, self.action_size])

        self.actor_local.eval()
        with torch.no_grad():
            for a in range(self.agents_count):
                actions[a] = self.actor_local(states[a]).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # as suggested in the "Benchmak implementation" section of the course"
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
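
# Usage sketch for Multi_Agents above. The environment below is a random
# stand-in used only to exercise the API (the state/action sizes mirror the
# 20-agent Reacher task from the course); every hyper-parameter value is
# illustrative, and OUNoise is assumed to be any noise object exposing
# sample() and reset().
import numpy as np


class _DummyMultiAgentEnv:
    """Random-transition stand-in with the per-agent array interface assumed by Multi_Agents."""

    def __init__(self, agents_count=20, state_size=33, action_size=4):
        self.agents_count, self.state_size = agents_count, state_size

    def reset(self):
        return np.random.randn(self.agents_count, self.state_size)

    def step(self, actions):
        next_states = np.random.randn(self.agents_count, self.state_size)
        rewards = np.random.rand(self.agents_count)
        dones = np.zeros(self.agents_count, dtype=bool)
        return next_states, rewards, dones


env = _DummyMultiAgentEnv()
agents = Multi_Agents(agents_count=20, state_size=33, action_size=4,
                      random_seed=0, buffer_size=int(1e5), batch_size=128,
                      gamma=0.99, fc1_units=256, fc2_units=128,
                      noise=OUNoise((20, 4), 0), lr_actor=1e-4, lr_critic=1e-3)

for episode in range(10):
    states = env.reset()                 # (agents_count, state_size)
    agents.reset()
    for t in range(100):
        actions = agents.act(states)     # (agents_count, action_size), clipped to [-1, 1]
        next_states, rewards, dones = env.step(actions)
        agents.step(states, actions, rewards, next_states, dones)
        states = next_states
        if np.any(dones):
            break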
Exemplo n.º 21
0
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size).to(device)
        self.actor_target = Actor(state_size, action_size).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic1_local = Critic(state_size, action_size).to(device)
        self.critic1_target = Critic(state_size, action_size).to(device)
        self.critic1_optimizer = optim.Adam(self.critic1_local.parameters(),
                                            lr=LR_CRITIC,
                                            weight_decay=WEIGHT_DECAY)

        self.critic2_local = Critic(state_size, action_size).to(device)
        self.critic2_target = Critic(state_size, action_size).to(device)
        self.critic2_optimizer = optim.Adam(self.critic2_local.parameters(),
                                            lr=LR_CRITIC,
                                            weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size)

        # Replay memory
        self.memory = PER(BUFFER_SIZE)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory."""
        # Set reward as initial priority, see:
        #   https://jaromiru.com/2016/11/07/lets-make-a-dqn-double-learning-and-prioritized-experience-replay/
        self.memory.add((state, action, reward, next_state, done), reward)

    def act(self, state):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        action += self.noise.sample()
        return np.clip(action, -1., 1.)

    def reset(self):
        self.noise.reset()

    def mse(self, expected, targets, is_weights):
        """Custom loss function that takes into account the importance-sampling weights."""
        td_error = expected - targets
        weighted_squared_error = is_weights * td_error * td_error
        return torch.sum(weighted_squared_error) / torch.numel(
            weighted_squared_error)

    def learn(self):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        """
        for i in range(1, LEARN_BATCH + 1):
            idxs, experiences, is_weights = self.memory.sample(BATCH_SIZE)

            states = torch.from_numpy(
                np.vstack([e[0] for e in experiences
                           if e is not None])).float().to(device)
            actions = torch.from_numpy(
                np.vstack([e[1] for e in experiences
                           if e is not None])).float().to(device)
            rewards = torch.from_numpy(
                np.vstack([e[2] for e in experiences
                           if e is not None])).float().to(device)
            next_states = torch.from_numpy(
                np.vstack([e[3] for e in experiences
                           if e is not None])).float().to(device)
            dones = torch.from_numpy(
                np.vstack([e[4] for e in experiences if e is not None
                           ]).astype(np.uint8)).float().to(device)

            is_weights = torch.from_numpy(is_weights).float().to(device)

            # ---------------------------- update critic ---------------------------- #
            # Target Policy Smoothing Regularization: add a small amount of clipped random noise to the selected action
            if POLICY_NOISE > 0.0:
                noise = torch.empty_like(actions).data.normal_(
                    0, POLICY_NOISE).to(device)
                noise = noise.clamp(-POLICY_NOISE_CLIP, POLICY_NOISE_CLIP)
                # Get predicted next-state actions and Q values from target models
                actions_next = (self.actor_target(next_states) + noise).clamp(
                    -1., 1.)
            else:
                # Get predicted next-state actions and Q values from target models
                actions_next = self.actor_target(next_states)

            # Error Mitigation
            Q_targets_next = torch.min(\
                self.critic1_target(next_states, actions_next), \
                self.critic2_target(next_states, actions_next)).detach()

            # Compute Q targets for current states (y_i)
            Q_targets = rewards + (GAMMA * Q_targets_next * (1 - dones))

            # Compute critic1 loss
            Q_expected = self.critic1_local(states, actions)
            errors1 = np.abs((Q_expected - Q_targets).detach().cpu().numpy())
            critic1_loss = self.mse(Q_expected, Q_targets, is_weights)
            # Minimize the loss
            self.critic1_optimizer.zero_grad()
            critic1_loss.backward()
            torch.nn.utils.clip_grad_norm_(self.critic1_local.parameters(), 1)
            self.critic1_optimizer.step()

            # Update priorities in the replay buffer
            self.memory.batch_update(idxs, errors1)

            # Compute critic2 loss
            Q_expected = self.critic2_local(states, actions)
            critic2_loss = self.mse(Q_expected, Q_targets, is_weights)
            # Minimize the loss
            self.critic2_optimizer.zero_grad()
            critic2_loss.backward()
            torch.nn.utils.clip_grad_norm_(self.critic2_local.parameters(), 1)
            self.critic2_optimizer.step()

            # Delayed Policy Updates
            if i % UPDATE_ACTOR_EVERY == 0:
                # ---------------------------- update actor ---------------------------- #
                # Compute actor loss
                actions_pred = self.actor_local(states)
                actor_loss = -self.critic1_local(states, actions_pred).mean()
                # Minimize the loss
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                # ----------------------- update target networks ----------------------- #
                self.soft_update(self.critic1_local, self.critic1_target, TAU)
                self.soft_update(self.critic2_local, self.critic2_target, TAU)
                self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def save_weights(self):
        torch.save(self.actor_local.state_dict(), actor_weights_file)
        torch.save(self.critic1_local.state_dict(), critic1_weights_file)
        torch.save(self.critic2_local.state_dict(), critic2_weights_file)

    def load_weights(self):
        self.actor_local.load_state_dict(torch.load(actor_weights_file))
        self.critic1_local.load_state_dict(torch.load(critic1_weights_file))
        self.critic2_local.load_state_dict(torch.load(critic2_weights_file))
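
# Illustrative values for the TD3-style constants and file names referenced by
# the Agent above (typical defaults, not taken from the original snippet; the
# *_weights_file paths are hypothetical placeholders):
BUFFER_SIZE = int(1e6)       # replay buffer capacity
BATCH_SIZE = 128             # minibatch size per sampled update
LEARN_BATCH = 10             # sampled updates per learn() call
GAMMA = 0.99                 # discount factor
TAU = 1e-3                   # soft-update interpolation
LR_ACTOR = 1e-4              # actor learning rate
LR_CRITIC = 1e-3             # critic learning rate
WEIGHT_DECAY = 0.0           # critic L2 regularization
POLICY_NOISE = 0.2           # std of target-policy smoothing noise
POLICY_NOISE_CLIP = 0.5      # clip range for that noise
UPDATE_ACTOR_EVERY = 2       # delayed policy (actor) updates

actor_weights_file = 'actor.pth'
critic1_weights_file = 'critic1.pth'
critic2_weights_file = 'critic2.pth'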
Exemplo n.º 22
0
class DDPG(object):
    def __init__(self, nb_status, nb_actions, args):
        self.num_actor = 3

        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.discrete = args.discrete
        self.pic = args.pic
        if self.pic:
            self.nb_status = args.pic_status

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'use_bn': args.bn
        }
        if args.pic:
            self.cnn = CNN(3, args.pic_status)
            self.cnn_optim = Adam(self.cnn.parameters(), lr=args.crate)
        self.actors = [
            Actor(self.nb_status, self.nb_actions)
            for _ in range(self.num_actor)
        ]
        self.actor_targets = [
            Actor(self.nb_status, self.nb_actions)
            for _ in range(self.num_actor)
        ]
        self.actor_optims = [
            Adam(self.actors[i].parameters(), lr=args.prate)
            for i in range(self.num_actor)
        ]

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        for i in range(self.num_actor):
            hard_update(
                self.actor_targets[i],
                self.actors[i])  # Ensure each target starts with the same weights
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = rpm(
            args.rmsize
        )  # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        #
        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.use_cuda = args.cuda
        #
        if self.use_cuda: self.cuda()

    def normalize(self, pic):
        # Rearrange the image from HWC to CHW layout, as expected by the CNN
        pic = pic.swapaxes(0, 2).swapaxes(1, 2)
        return pic

    def update_policy(self, train_actor=True):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # Prepare for the target q batch
        if self.pic:
            state_batch = np.array([self.normalize(x) for x in state_batch])
            state_batch = to_tensor(state_batch, volatile=True)
            state_batch = self.cnn(state_batch)
            next_state_batch = np.array(
                [self.normalize(x) for x in next_state_batch])
            next_state_batch = to_tensor(next_state_batch, volatile=True)
            next_state_batch = self.cnn(next_state_batch)
            # Pick one ensemble actor target at random to produce the bootstrap action.
            index = np.random.randint(low=0, high=self.num_actor)
            next_q_values = self.critic_target(
                [next_state_batch,
                 self.actor_targets[index](next_state_batch)])
        else:
            index = np.random.randint(low=0, high=self.num_actor)
            next_q_values = self.critic_target([
                to_tensor(next_state_batch, volatile=True),
                self.actor_targets[index](to_tensor(next_state_batch,
                                                    volatile=True)),
            ])
        # print('batch of picture is ok')
        next_q_values.volatile = False

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor((1 - terminal_batch.astype(np.float))) * next_q_values

        # Critic update
        self.critic.zero_grad()
        if self.pic: self.cnn.zero_grad()

        if self.pic:
            state_batch.volatile = False
            q_batch = self.critic([state_batch, to_tensor(action_batch)])
        else:
            q_batch = self.critic(
                [to_tensor(state_batch),
                 to_tensor(action_batch)])

        # print(reward_batch, next_q_values*self.discount, target_q_batch, terminal_batch.astype(np.float))
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()
        if self.pic: self.cnn_optim.step()

        sum_policy_loss = 0
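        # Update each ensemble actor independently against the shared critic.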
        for i in range(self.num_actor):
            self.actors[i].zero_grad()

            policy_loss = -self.critic([
                to_tensor(state_batch), self.actors[i](to_tensor(state_batch))
            ])

            policy_loss = policy_loss.mean()
            policy_loss.backward()
            if train_actor:
                self.actor_optims[i].step()
            sum_policy_loss += policy_loss

            # Target update
            soft_update(self.actor_targets[i], self.actors[i], self.tau)

        soft_update(self.critic_target, self.critic, self.tau)

        return -sum_policy_loss / self.num_actor, value_loss

    def eval(self):
        for i in range(self.num_actor):
            self.actors[i].eval()
            self.actor_targets[i].eval()
        self.critic.eval()
        self.critic_target.eval()

    def train(self):
        for i in range(self.num_actor):
            self.actors[i].train()
            self.actor_targets[i].train()
        self.critic.train()
        self.critic_target.train()

    def cuda(self):
        for i in range(self.num_actor):
            self.actors[i].cuda()
            self.actor_targets[i].cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        if self.discrete:
            return action.argmax()
        else:
            return action

    def select_action(self,
                      s_t,
                      decay_epsilon=True,
                      return_fix=False,
                      noise_level=0):
        actions = []
        status = []
        tot_score = []
        noise_level = noise_level * max(self.epsilon, 0)
        for i in range(self.num_actor):
            action = to_numpy(self.actors[i](to_tensor(
                np.array([s_t]), volatile=True))).squeeze(0)
            action = action + self.random_process.sample() * noise_level
            status.append(s_t)
            actions.append(action)
            tot_score.append(0.)

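        # Let the shared critic score every actor's proposed action; the best-scoring action is executed.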
        scores = self.critic([
            to_tensor(np.array(status), volatile=True),
            to_tensor(np.array(actions), volatile=True)
        ])
        for j in range(self.num_actor):
            tot_score[j] += scores.data[j][0]
        best = np.array(tot_score).argmax()

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = actions[best]
        return actions[best]

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=0):
        if output is None: return
        # Each ensemble actor is stored in its own indexed file.
        for i in range(self.num_actor):
            actor_weights = torch.load('{}/actor{}_{}.pkl'.format(
                output, num, i))
            self.actors[i].load_state_dict(actor_weights)
            self.actor_targets[i].load_state_dict(actor_weights)
        critic_weights = torch.load('{}/critic{}.pkl'.format(output, num))
        self.critic.load_state_dict(critic_weights)
        self.critic_target.load_state_dict(critic_weights)

    def save_model(self, output, num):
        if self.use_cuda:
            for actor in self.actors:
                actor.cpu()
            self.critic.cpu()
        for i in range(self.num_actor):
            torch.save(self.actors[i].state_dict(),
                       '{}/actor{}_{}.pkl'.format(output, num, i))
        torch.save(self.critic.state_dict(),
                   '{}/critic{}.pkl'.format(output, num))
        if self.use_cuda:
            for actor in self.actors:
                actor.cuda()
            self.critic.cuda()
Exemplo n.º 23
0
        'valid_critic_loss': 0.0,
        'valid_generator_loss': 0.0
    }

    train_loader, valid_loader, test_loader = svhn_sampler(data_root, train_batch_size, test_batch_size)
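    # repeater (defined elsewhere) re-yields batches so that next() can be called indefinitely in the training loop below.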
    train_loader, valid_loader, test_loader = repeater(train_loader), repeater(valid_loader), repeater(test_loader)
    train_iter, valid_iter, test_iter = iter(train_loader), iter(valid_loader), iter(test_loader)

    generator = Generator(z_dim=z_dim).to(device)
    critic = Critic().to(device)

    optim_critic = optim.Adam(critic.parameters(), lr=lr, betas=(beta1, beta2))
    optim_generator = optim.Adam(generator.parameters(), lr=lr, betas=(beta1, beta2))

    checkpoint = torch.load('save.tar')
    critic.load_state_dict(checkpoint['critic'])
    generator.load_state_dict(checkpoint['generator'])
    optim_critic.load_state_dict(checkpoint['optim_critic'])
    optim_generator.load_state_dict(checkpoint['optim_generator'])


    for i in range(n_iter*n_critic_updates):
        generator.train()
        critic.train()

        # update critic
        x = next(train_iter)[0].to(device)
        noise = torch.randn(train_batch_size, z_dim).to(device)
        y = generator(noise).detach()
        optim_critic.zero_grad()
        score = (-distances.vf_wasserstein_distance(x, y, critic))
Exemplo n.º 24
0
class Agent:
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 buffer_size=int(1e5),
                 batch_size=256,
                 learn_every=1,
                 update_every=1,
                 gamma=0.99,
                 tau=0.02,
                 lr_actor=2e-4,
                 lr_critic=2e-3,
                 random_seed=None,
                 use_asn=True,
                 asn_kwargs={},
                 use_psn=False,
                 psn_kwargs={},
                 use_per=False,
                 restore=None):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.update_every = update_every
        self.learn_every = learn_every
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        # Keep track of how many times we've updated weights
        self.i_updates = 0
        self.i_step = 0
        self.use_asn = use_asn
        self.use_psn = use_psn
        self.use_per = use_per
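        # Optional components: action-space noise (OUNoise), parameter-space noise, and prioritized replay.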

        if random_seed is not None:
            random.seed(random_seed)

        self.actor_local = Actor(state_size, action_size).to(device)
        self.actor_target = Actor(state_size, action_size).to(device)
        if self.use_psn:
            self.actor_perturbed = Actor(state_size, action_size).to(device)
        self.critic_local = Critic(state_size, action_size).to(device)
        self.critic_target = Critic(state_size, action_size).to(device)

        # restore networks if needed
        if restore is not None:
            checkpoint = torch.load(restore, map_location=device)
            self.actor_local.load_state_dict(checkpoint[0]['actor'])
            self.actor_target.load_state_dict(checkpoint[0]['actor'])
            if self.use_psn:
                self.actor_perturbed.load_state_dict(checkpoint[0]['actor'])
            self.critic_local.load_state_dict(checkpoint[0]['critic'])
            self.critic_target.load_state_dict(checkpoint[0]['critic'])

        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic)

        # Hard copy weights from local to target networks
        policy_update(self.actor_local, self.actor_target, 1.0)
        policy_update(self.critic_local, self.critic_target, 1.0)

        # Noise process
        if self.use_asn:
            self.action_noise = OUNoise(action_size, **asn_kwargs)

        if self.use_psn:
            self.param_noise = ParameterSpaceNoise(**psn_kwargs)

        if self.use_per:
            self.buffer = PrioritizedExperienceReplay(buffer_size, batch_size,
                                                      random_seed)
        else:
            self.buffer = ExperienceReplay(buffer_size, batch_size,
                                           random_seed)

    def act(self, states, perturb_mode=True, train_mode=True):
        """Returns actions for given state as per current policy."""
        if not train_mode:
            self.actor_local.eval()
            if self.use_psn:
                self.actor_perturbed.eval()

        with torch.no_grad():
            states = torch.from_numpy(states).float().to(device)
            actor = self.actor_perturbed if (
                self.use_psn and perturb_mode) else self.actor_local
            actions = actor(states).cpu().numpy()[0]

        if train_mode:
            actions += self.action_noise.sample()

        self.actor_local.train()
        if self.use_psn:
            self.actor_perturbed.train()

        return np.clip(actions, -1, 1)

    def perturb_actor_parameters(self):
        """Apply parameter space noise to actor model, for exploration"""
        policy_update(self.actor_local, self.actor_perturbed, 1.0)
        params = self.actor_perturbed.state_dict()
        for name in params:
            if 'ln' in name:
                continue  # leave layer-norm parameters unperturbed
            param = params[name]
            random = torch.randn(param.shape)
            if use_cuda:
                random = random.cuda()
            param += random * self.param_noise.current_stddev

    def reset(self):
        self.action_noise.reset()
        if self.use_psn:
            self.perturb_actor_parameters()

    def step(self, experience, priority=0.0):
        self.buffer.push(experience)
        self.i_step += 1
        if len(self.buffer) > self.batch_size:
            if self.i_step % self.learn_every == 0:
                self.learn(priority)
            if self.i_step % self.update_every == 0:
                self.update(
                )  # soft update the target network towards the actual networks

    def learn(self, priority=0.0):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        if self.use_per:
            (states, actions, rewards, states_next,
             dones), batch_idx = self.buffer.sample(priority)
        else:
            states, actions, rewards, states_next, dones = self.buffer.sample()

        # Get predicted next-state actions and Q values from target models
        with torch.no_grad():
            actions_next = self.actor_target(states_next)
            Q_targets_next = self.critic_target(states_next, actions_next)
            Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # ---------------------------- update critic ---------------------------- #
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.smooth_l1_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.critic_local.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # Minimize the loss
        self.actor_local.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        if self.use_per:
            Q_error = Q_expected - Q_targets
            new_deltas = torch.abs(Q_error.detach().squeeze(1)).cpu().numpy()
            self.buffer.update_deltas(batch_idx, new_deltas)

    def update(self):
        """soft update targets"""
        self.i_updates += 1
        policy_update(self.actor_local, self.actor_target, self.tau)
        policy_update(self.critic_local, self.critic_target, self.tau)

    def save_model(self, model_dir, session_name, i_episode, best):

        filename = os.path.join(
            model_dir,
            f'ddpg_{session_name}-EP_{i_episode}-score_{best:.3f}.pt')
        filename_best = os.path.join(model_dir, f'ddpg_{session_name}-best.pt')
        save_dict_list = []
        save_dict = {
            'actor': self.actor_local.state_dict(),
            'actor_optim_params': self.actor_optimizer.state_dict(),
            'critic': self.critic_local.state_dict(),
            'critic_optim_params': self.critic_optimizer.state_dict()
        }
        save_dict_list.append(save_dict)
        torch.save(save_dict_list, filename)
        copyfile(filename, filename_best)

    def postprocess(self, t_step):
        if self.use_psn and t_step > 0:
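            # Measure how far perturbed actions drifted from unperturbed ones over the last
            # t_step transitions and adapt the parameter-noise stddev accordingly.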
            perturbed_states, perturbed_actions, _, _, _ = self.buffer.tail(
                t_step)
            unperturbed_actions = self.act(np.array(perturbed_states), False,
                                           False)
            diff = np.array(perturbed_actions) - unperturbed_actions
            mean_diff = np.mean(np.square(diff), axis=0)
            dist = sqrt(np.mean(mean_diff))
            self.param_noise.adapt(dist)
Exemplo n.º 25
0
    print('action size:', num_actions)

    writer = SummaryWriter(args.logdir)

    actor = Actor(num_inputs, num_actions)
    critic = Critic(num_inputs)

    running_state = ZFilter((num_inputs, ), clip=5)
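    # ZFilter normalizes observations with running mean/std statistics (clipped to +/-5);
    # those statistics (n, mean, sum_square) can be restored from a checkpoint below.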

    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model',
                                       str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

    actor_optim = optim.Adam(actor.parameters(), lr=hp.actor_lr)
    critic_optim = optim.Adam(critic.parameters(),
                              lr=hp.critic_lr,
                              weight_decay=hp.l2_rate)

    episodes = 0
    for iter in range(15000):
        actor.eval(), critic.eval()
Exemplo n.º 26
0
class DDPG(object):
    def __init__(self, nb_status, nb_actions, args, writer):
        self.clip_actor_grad = args.clip_actor_grad
        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.discrete = args.discrete
        self.pic = args.pic
        self.writer = writer
        self.select_time = 0        
        if self.pic:
            self.nb_status = args.pic_status
        
        # Create Actor and Critic Network
        net_cfg = {
            'hidden1':args.hidden1, 
            'hidden2':args.hidden2, 
            'use_bn':args.bn,
            'init_method':args.init_method
        }
        if args.pic:
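            # Pixel input: a CNN encodes frames into features; it keeps its own target copy that is
            # soft-updated along with the actor/critic targets.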
            self.cnn = CNN(1, args.pic_status)
            self.cnn_target = CNN(1, args.pic_status)
            self.cnn_optim = Adam(self.cnn.parameters(), lr=args.crate)
        self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_optim  = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim  = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor) # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)
        if args.pic:
            hard_update(self.cnn_target, self.cnn)
        
        #Create replay buffer
        self.memory = rpm(args.rmsize) # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        # 
        self.epsilon = 1.0
        self.s_t = None # Most recent state
        self.a_t = None # Most recent action
        self.use_cuda = args.cuda
        # 
        if self.use_cuda: self.cuda()

    def normalize(self, pic):
        pic = pic.swapaxes(0, 2).swapaxes(1, 2)
        return pic

    def update_policy(self):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # Prepare for the target q batch
        if self.pic:
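            # Current states are encoded with the online CNN, next states with the target CNN.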
            state_batch = np.array([self.normalize(x) for x in state_batch])
            state_batch = to_tensor(state_batch, volatile=True)
            state_batch = self.cnn(state_batch)
            next_state_batch = np.array([self.normalize(x) for x in next_state_batch])
            next_state_batch = to_tensor(next_state_batch, volatile=True)
            next_state_batch = self.cnn_target(next_state_batch)
            next_q_values = self.critic_target([
                next_state_batch,
                self.actor_target(next_state_batch)
            ])
        else:
            next_q_values = self.critic_target([
                to_tensor(next_state_batch, volatile=True),
                self.actor_target(to_tensor(next_state_batch, volatile=True)),
            ])
        # print('batch of picture is ok')
        next_q_values.volatile = False

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor((1 - terminal_batch.astype(np.float))) * next_q_values

        # Critic update
        self.critic.zero_grad()
        if self.pic: self.cnn.zero_grad()

        if self.pic:
            state_batch.volatile = False
            q_batch = self.critic([state_batch, to_tensor(action_batch)])
        else:
            q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])

        # print(reward_batch, next_q_values*self.discount, target_q_batch, terminal_batch.astype(np.float))
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()
        if self.pic: self.cnn_optim.step()

        self.actor.zero_grad()
        if self.pic: self.cnn.zero_grad()

        if self.pic:
            state_batch.volatile = False
            policy_loss = -self.critic([
                state_batch,
                self.actor(state_batch)
            ])
        else:
            policy_loss = -self.critic([
                to_tensor(state_batch),
                self.actor(to_tensor(state_batch))
            ])

        policy_loss = policy_loss.mean()
        policy_loss.backward()

        if self.clip_actor_grad is not None:
            torch.nn.utils.clip_grad_norm(self.actor.parameters(), float(self.clip_actor_grad))

            if self.writer != None:
                mean_policy_grad = np.array(np.mean([np.linalg.norm(p.grad.data.cpu().numpy().ravel()) for p in self.actor.parameters()]))
                #print(mean_policy_grad)
                self.writer.add_scalar('train/mean_policy_grad', mean_policy_grad, self.select_time)

        self.actor_optim.step()
        if self.pic: self.cnn_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)
        if self.pic:
            soft_update(self.cnn_target, self.cnn, self.tau)

        return -policy_loss, value_loss

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()
        if(self.pic):
            self.cnn.eval()
            self.cnn_target.eval()

    def train(self):
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()
        if(self.pic):
            self.cnn.train()
            self.cnn_target.train()

    def cuda(self):
        self.cnn.cuda()
        self.cnn_target.cuda()
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self, fix=False):
        action = np.random.uniform(-1.,1.,self.nb_actions)
        self.a_t = action
        if self.discrete and fix == False:
            action = action.argmax()
#        if self.pic:
#            action = np.concatenate((softmax(action[:16]), softmax(action[16:])))
        return action
        
    def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0):
        self.eval()
        if self.pic:
            s_t = self.normalize(s_t)
            s_t = self.cnn(to_tensor(np.array([s_t])))
        if self.pic:
            action = to_numpy(
                self.actor_target(s_t)
            ).squeeze(0)
        else:
            action = to_numpy(
                self.actor(to_tensor(np.array([s_t])))
            ).squeeze(0)
        self.train()
        noise_level = noise_level * max(self.epsilon, 0)
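        # Epsilon-greedy style exploration: with probability noise_level the policy action is replaced by a random one.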

        if np.random.uniform(0, 1) < noise_level:
            action = self.random_action(fix=True) # episilon greedy            

        if decay_epsilon:
            self.epsilon -= self.depsilon
        self.a_t = action
        
        if return_fix:
            return action
        if self.discrete:
            return action.argmax()
        else:
            return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=1):        
        if output is None: return
        self.actor.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num))
        )
        self.actor_target.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num))
        )
        self.critic.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )
        self.critic_target.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )

    def save_model(self, output, num):
        if self.use_cuda:
            self.cnn.cpu()
            self.actor.cpu()
            self.critic.cpu()
        torch.save(
            self.actor.state_dict(),
            '{}/actor{}.pkl'.format(output, num)
        )
        torch.save(
            self.critic.state_dict(),
            '{}/critic{}.pkl'.format(output, num)
        )
        if self.use_cuda:
            self.cnn.cuda()
            self.actor.cuda()
            self.critic.cuda()
Exemplo n.º 27
0
class Agent():
    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 device,
                 gamma=GAMMA,
                 tau=TAU,
                 lr_actor=LR_ACTOR,
                 lr_critic=LR_CRITIC,
                 random_seed=0):
        """
            Initialize an Agent object.
        :param state_size: size of state
        :param action_size: size of action
        :param num_agents: number of agents
        :param gamma: discount factor
        :param tau: factor for soft update of target parameters
        :param lr_actor: Learning rate of actor
        :param lr_critic: Learning rate of critic
        :param random_seed: Random seed
        :param device: cuda or cpu
        """

        self.device = device
        self.gamma = gamma
        self.tau = tau

        self.num_agents = num_agents

        self.state_size = state_size
        self.action_size = action_size
        self.full_state_size = state_size * num_agents
        self.full_action_size = action_size * num_agents
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, device,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, device,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Critic Network (w/ Target Network)
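        # The critic is centralized (MADDPG-style): it conditions on the concatenated states and actions of all agents.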
        self.critic_local = Critic(self.full_state_size,
                                   self.full_action_size,
                                   device=device,
                                   random_seed=random_seed).to(device)
        self.critic_target = Critic(self.full_state_size,
                                    self.full_action_size,
                                    device=device,
                                    random_seed=random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=0)

        self.noise = OUNoise(action_size, random_seed)

    def save_model(self, agent_number):
        torch.save(self.actor_local.state_dict(),
                   f'models/checkpoint_actor_{agent_number}.pth')
        torch.save(self.critic_local.state_dict(),
                   f'models/checkpoint_critic_{agent_number}.pth')

    def load_model(self, agent_number):
        checkpoint = torch.load(f'models/checkpoint_actor_{agent_number}.pth',
                                map_location=torch.device('cpu'))
        self.actor_local.load_state_dict(checkpoint)

        checkpoint = torch.load(f'models/checkpoint_critic_{agent_number}.pth',
                                map_location=torch.device('cpu'))
        self.critic_local.load_state_dict(checkpoint)

    def act(self, state, noise=0., train=False):
        """Returns actions for given state as per current policy.
        :param state: state as seen from single agent
        """

        if train is True:
            self.actor_local.train()
        else:
            self.actor_local.eval()

        action = self.actor_local(state)
        if noise > 0:
            noise = torch.tensor(noise * self.noise.sample(),
                                 dtype=state.dtype,
                                 device=state.device)
        return action + noise

    def target_act(self, state, noise=0.):
        #self.actor_target.eval()
        # convert to cpu() since noise is in cpu()
        self.actor_target.eval()
        action = self.actor_target(state).cpu()
        if noise > 0.:
            noise = torch.tensor(noise * self.noise.sample(),
                                 dtype=state.dtype,
                                 device=state.device)
        return action + noise

    def update_critic(self, rewards, dones, all_states, all_actions,
                      all_next_states, all_next_actions):
        with torch.no_grad():
            Q_targets_next = self.critic_target(all_next_states,
                                                all_next_actions)
            # Compute Q targets for current states (y_i)
        q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        q_expected = self.critic_local(all_states, all_actions)
        # critic_loss = F.mse_loss(q_expected, q_targets)
        critic_loss = ((q_expected - q_targets.detach())**2).mean()
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

    def update_actor(self, all_states, all_predicted_actions):
        """Update actor network

        :param all_states: all states
        :param all_predicted_actions: all predicted actions
        """
        actor_loss = -self.critic_local(all_states,
                                        all_predicted_actions).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor_optimizer.step()

    def update_targets(self):
        self.soft_update(self.actor_local, self.actor_target, self.tau)
        self.soft_update(self.critic_local, self.critic_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def reset(self):
        self.noise.reset()
Exemplo n.º 28
0
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 random_seed,
                 memory=None,
                 buffer_size=BUFFER_SIZE,
                 batch_size=BATCH_SIZE,
                 gamma=GAMMA,
                 tau=TAU,
                 lr_actor=LR_ACTOR,
                 lr_critic=LR_CRITIC,
                 weight_decay=WEIGHT_DECAY,
                 pretrained_actor_weights=None,
                 pretrained_critic_weights=None):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.weight_decay = weight_decay

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.lr_critic,
                                           weight_decay=self.weight_decay)

        if pretrained_actor_weights:
            actor_weights = torch.load(pretrained_actor_weights)
            self.actor_local.load_state_dict(actor_weights)
            self.actor_target.load_state_dict(actor_weights)

        if pretrained_critic_weights:
            critic_weights = torch.load(pretrained_critic_weights)
            self.critic_local.load_state_dict(critic_weights)
            self.critic_target.load_state_dict(critic_weights)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
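        # An already-constructed buffer can be injected (e.g. to share experience between agents); otherwise a private ReplayBuffer is created.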
        if memory:
            self.memory = memory
        else:
            self.memory = ReplayBuffer(action_size, self.buffer_size,
                                       self.batch_size, random_seed)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device).unsqueeze(0)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Exemplo n.º 29
0
class DDPG(object):
    def __init__(self, nb_states, nb_actions, args):

        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions

        # Create Actor and Critic Network
        net_cfg = {
            "hidden1": args.hidden1,
            "hidden2": args.hidden2,
            "init_w": args.init_w,
        }
        self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(
            self.actor_target, self.actor
        )  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = SequentialMemory(
            limit=args.rmsize, window_length=args.window_length
        )
        self.random_process = OrnsteinUhlenbeckProcess(
            size=nb_actions, theta=args.ou_theta, mu=args.ou_mu, sigma=args.ou_sigma
        )

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        #
        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True

        #
        if USE_CUDA:
            self.cuda()

    def update_policy(self):
        # Sample batch
        (
            state_batch,
            action_batch,
            reward_batch,
            next_state_batch,
            terminal_batch,
        ) = self.memory.sample_and_split(self.batch_size)

        # Prepare for the target q batch
        next_q_values = self.critic_target(
            [
                to_tensor(next_state_batch, volatile=True),
                self.actor_target(to_tensor(next_state_batch, volatile=True)),
            ]
        )
        # next_q_values.volatile = False

        target_q_batch = (
            to_tensor(reward_batch)
            + self.discount * to_tensor(terminal_batch.astype(np.float)) * next_q_values
        )

        # Critic update
        self.critic.zero_grad()

        q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])

        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update
        self.actor.zero_grad()

        policy_loss = -self.critic(
            [to_tensor(state_batch), self.actor(to_tensor(state_batch))]
        )

        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1.0, 1.0, self.nb_actions)
        self.a_t = action
        return action

    def select_action(self, s_t, decay_epsilon=True):
        action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
        action += self.is_training * max(self.epsilon, 0) * self.random_process.sample()
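        # is_training acts as a 0/1 gate, so exploration noise is only injected during training.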
        action = np.clip(action, -1.0, 1.0)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_states()

    def load_weights(self, output):
        if output is None:
            return

        self.actor.load_state_dict(torch.load("{}/actor.pkl".format(output)))

        self.critic.load_state_dict(torch.load("{}/critic.pkl".format(output)))

    def save_model(self, output):
        torch.save(self.actor.state_dict(), "{}/actor.pkl".format(output))
        torch.save(self.critic.state_dict(), "{}/critic.pkl".format(output))

    def seed(self, s):
        torch.manual_seed(s)
        if USE_CUDA:
            torch.cuda.manual_seed(s)
Exemplo n.º 30
0
class DDPG(object):
    def __init__(self, nb_states, nb_actions, args, discrete, use_cuda=False):
        
        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions
        self.discrete = discrete
        
        # Create Actor and Critic Network
        net_cfg = {
            'hidden1':args.hidden1, 
            'hidden2':args.hidden2, 
            'init_w':args.init_w
        }
        self.actor = Actor(self.nb_states * args.window_length, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_states * args.window_length, self.nb_actions, **net_cfg)
        self.actor_optim  = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_states * args.window_length, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_states * args.window_length, self.nb_actions, **net_cfg)
        self.critic_optim  = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor) # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)
        
        #Create replay buffer
        self.memory = rpm(args.rmsize) # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        # 
        self.epsilon = 1.0
        self.s_t = None # Most recent state
        self.a_t = None # Most recent action
        self.use_cuda = use_cuda
        # 
        if self.use_cuda: self.cuda()
        
    def update_policy(self, train_actor = True):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # state_batch, action_batch, reward_batch, \
        # next_state_batch, terminal_batch = self.memory.sample_and_split(self.batch_size)

        # Prepare for the target q batch
        next_q_values = self.critic_target([
            to_tensor(next_state_batch, volatile=True),
            self.actor_target(to_tensor(next_state_batch, volatile=True)),
        ])
        next_q_values.volatile = False

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor((1 - terminal_batch.astype(np.float))) * next_q_values

        # Critic update
        self.critic.zero_grad()

        q_batch = self.critic([ to_tensor(state_batch), to_tensor(action_batch) ])
        
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        self.actor.zero_grad()

        policy_loss = -self.critic([
            to_tensor(state_batch),
            self.actor(to_tensor(state_batch))
        ])

        policy_loss = policy_loss.mean()
        policy_loss.backward()
        if train_actor == True:
            self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return -policy_loss, value_loss

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def cuda(self):
        print("use cuda")
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1.,1.,self.nb_actions)
        self.a_t = action
        if self.discrete:
            return action.argmax()
        else:
            return action

    def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=1):
        action = to_numpy(
            self.actor(to_tensor(np.array([s_t])))
        ).squeeze(0)
        # print(self.random_process.sample(), action)
        noise_level = noise_level * max(self.epsilon, 0)
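        # Blend the deterministic action with exploration noise; the noise share shrinks as epsilon decays.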
        action = action * (1 - noise_level) + (self.random_process.sample() * noise_level)
        # print(max(self.epsilon, 0) * self.random_process.sample() * noise_level, noise_level)
        action = np.clip(action, -1., 1.)
        # print(action)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        if return_fix:
            return action
        if self.discrete:
            return action.argmax()
        else:
            return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_states()

    def load_weights(self, output):
        if output is None: return

        self.actor.load_state_dict(
            torch.load('{}/actor.pkl'.format(output))
        )

        self.critic.load_state_dict(
            torch.load('{}/critic.pkl'.format(output))
        )


    def save_model(self, output):
        if self.use_cuda:
            self.actor.cpu()
            self.critic.cpu()
        torch.save(
            self.actor.state_dict(),
            '{}/actor.pkl'.format(output)
        )
        torch.save(
            self.critic.state_dict(),
            '{}/critic.pkl'.format(output)
        )
        if self.use_cuda:
            self.actor.cuda()
            self.critic.cuda()

    def seed(self,s):
        torch.manual_seed(s)
        if self.use_cuda:
            torch.cuda.manual_seed(s)
Exemplo n.º 31
0
class Agent():
    def __init__(self, test=False):
        # device
        if torch.cuda.is_available():
            self.device = torch.device('cuda')
        else:
            self.device = torch.device('cpu')
        #########################################
        """
        Some hand tune config(for developing)
        """
        self.discrete = False
        self.action_dim = 1
        self.state_dim = 3
        self.batch_size = 100
        self.action_low = -2
        self.action_high = 2
        ##########################################
        self.P_online = Actor(state_dim=self.state_dim,
                              action_size=self.action_dim).to(self.device)
        self.P_target = Actor(state_dim=self.state_dim,
                              action_size=self.action_dim).to(self.device)
        self.P_target.load_state_dict(self.P_online.state_dict())
        self.Q_online = Critic(state_size=self.state_dim,
                               action_size=self.action_dim).to(self.device)
        self.Q_target = Critic(state_size=self.state_dim,
                               action_size=self.action_dim).to(self.device)
        self.Q_target.load_state_dict(self.Q_online.state_dict())
        # discounted reward
        self.gamma = 0.99
        self.eps = 0.25
        # optimizer
        self.q_optimizer = torch.optim.Adam(self.Q_online.parameters(),
                                            lr=1e-3)
        self.p_optimizer = torch.optim.Adam(self.P_online.parameters(),
                                            lr=1e-3)
        # saved rewards and actions
        self.replay_buffer = ReplayBuffer()

        # noise
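        # Ornstein-Uhlenbeck exploration noise; ou_level carries the correlated noise state across steps.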
        self.noise = Noise(DELTA, SIGMA, OU_A, OU_MU)
        # Initialize noise
        self.ou_level = 0.

        self.ep_step = 0

    def act(self, state, test=False):
        if not test:
            with torch.no_grad():
                # boring type casting
                state = ((torch.from_numpy(state)).unsqueeze(0)).float().to(
                    self.device)
                action = self.P_online(state)  # continuous output
                a = action.data.cpu().numpy()
                # if self.ep_step < 200:
                # self.ou_level = self.noise.ornstein_uhlenbeck_level(self.ou_level)
                # a = a + self.ou_level
                if self.discrete:
                    action = np.argmax(a)
                    return a, action
                else:
                    if self.ep_step < 200:
                        self.ou_level = self.noise.ornstein_uhlenbeck_level(
                            self.ou_level)
                    action = np.clip(a + self.ou_level, self.action_low,
                                     self.action_high)
                    return action, action

    def collect_data(self, state, action, reward, next_state, done):
        self.replay_buffer.push(
            torch.from_numpy(state).float().unsqueeze(0),
            torch.from_numpy(action).float(),
            torch.tensor([reward]).float().unsqueeze(0),
            torch.from_numpy(next_state).float().unsqueeze(0),
            torch.tensor([done]).float().unsqueeze(0))

    def clear_data(self):
        raise NotImplementedError("A circular queue doesn't need this function")

    def update(self):
        if len(self.replay_buffer) < self.batch_size:
            return
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(
            batch_size=self.batch_size, device=self.device)
        # discounted rewards
        # rewards = torch.from_numpy(discount((rewards.view(rewards.shape[0])).cpu().numpy())).float().to(self.device)

        ### debug shape : ok
        #===============================Critic Update===============================
        self.Q_online.train()
        Q = self.Q_online((states, actions))

        with torch.no_grad():  # don't need backprop for target value
            self.Q_target.eval()
            self.P_target.eval()
            target = rewards + self.gamma * (1 - dones) * self.Q_target(
                (next_states, self.P_target(next_states)))
        critic_loss_fn = torch.nn.MSELoss()
        critic_loss = critic_loss_fn(Q, target).mean()
        # update
        self.q_optimizer.zero_grad()
        critic_loss.backward()
        self.q_optimizer.step()
        # print("critic loss", critic_loss.item())

        #===============================Actor Update===============================
        # Freeze the online critic and update only the online actor
        self.Q_online.eval()
        for p in self.Q_online.parameters():
            p.requires_grad = False
        for p in self.P_online.parameters():
            p.requires_grad = True
        policy_loss = -self.Q_online((states, self.P_online(states)))
        policy_loss = policy_loss.mean()
        self.p_optimizer.zero_grad()
        policy_loss.backward()
        self.p_optimizer.step()
        # print("policy loss", policy_loss.item())
        for p in self.Q_online.parameters():
            p.requires_grad = True
        #===============================Target Update===============================
        soft_update(self.Q_target, self.Q_online, tau=1e-3)
        soft_update(self.P_target, self.P_online, tau=1e-3)
        self.eps -= EPSILON_DECAY
        if self.eps <= 0:
            self.eps = 0
Exemplo n.º 32
0
class Agent():
    """Interacts with and learns from the environment."""
    
    def __init__(self, state_size, action_size, seed, memory, batch_size, lr_actor, lr_critic, clip_critic, gamma, tau, weight_decay, update_network_steps, sgd_epoch, checkpoint_prefix):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            memory (ReplayBuffer): The replay buffer for storing xperiences
            batch_size (int): Number of experiences to sample from the memory
            lr_actor (float): The learning rate for the actor
            lr_critic (float): The learning rate critic
            clip_critic (float): The clip value for updating grads
            gamma (float): The reward discount factor
            tau (float): For soft update of target parameters
            weight_decay (float): The weight decay
            update_network_steps (int): How often to update the network
            sgd_epoch (int): Number of iterations for each network update
            checkpoint_prefix (string): The string prefix for saving checkpoint files
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.memory = memory
        self.batch_size = batch_size
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.clip_critic = clip_critic
        self.gamma = gamma
        self.tau = tau
        self.weight_decay = weight_decay
        self.update_network_steps = update_network_steps
        self.sgd_epoch = sgd_epoch
        self.n_step = 0
        
        # checkpoint
        self.checkpoint_prefix = checkpoint_prefix
        self.actor_loss_episodes = []
        self.critic_loss_episodes = []
        self.actor_loss = 0
        self.critic_loss = 0

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic, weight_decay=weight_decay)

        # Noise process
        self.noise = OUNoise(action_size, seed)

    def step(self, state, action, action_prob, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        for i in range(len(state)):
            self.memory.add(state[i], action[i], action_prob[i], reward[i], next_state[i], done[i])
         
        # learn every n steps
        self.n_step = (self.n_step + 1) % self.update_network_steps
        if self.n_step == 0:
            # Learn, if enough samples are available in memory
            if len(self.memory) > self.batch_size:
                for i in range(self.sgd_epoch):
                    experiences = self.memory.sample()
                    self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1), np.zeros_like(action) # N/A action prob for DDPG

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, p_a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, action_probs, rewards, next_states, dones = experiences

        # normalize rewards
        rewards = utils.normalize_rewards(rewards)
        
        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        self.critic_loss = critic_loss.item()  # store a plain float so the computation graph is not retained
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        if self.clip_critic > 0:
            torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), self.clip_critic)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        self.actor_loss = actor_loss.item()  # as above, store a plain float
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
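        # Worked example (illustrative numbers only): with tau = 1e-3, a target weight of
        # 0.50 and a local weight of 1.00 becomes 0.001*1.00 + 0.999*0.50 = 0.5005, so the
        # target network tracks the local one slowly and keeps the bootstrapped Q-targets stable.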
            
    def checkpoint(self):
        """Save internal information in memory for later checkpointing"""
        self.actor_loss_episodes.append(self.actor_loss)
        self.critic_loss_episodes.append(self.critic_loss)

    def save_checkpoint(self):
        """Persist checkpoint information"""
        # the history loss
        utils.plot_scores(self.checkpoint_prefix + "_actor_loss.png", self.actor_loss_episodes, label="loss")
        utils.plot_scores(self.checkpoint_prefix + "_critic_loss.png", self.critic_loss_episodes, label="loss")
        
        # network
        torch.save(self.actor_local.state_dict(), self.checkpoint_prefix + "_actor.pth")
        torch.save(self.critic_local.state_dict(), self.checkpoint_prefix + "_critic.pth")

    def load_checkpoint(self):
        """Restore checkpoint information"""
        self.actor_local.load_state_dict(torch.load(self.checkpoint_prefix + "_actor.pth"))
        self.critic_local.load_state_dict(torch.load(self.checkpoint_prefix + "_critic.pth"))
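# Usage sketch (not part of the original listing): a minimal driver loop for the Agent
# above. The environment object `env`, the ReplayBuffer constructor signature, and all
# hyperparameter values below are assumptions for illustration only; it is not runnable
# without the project's own ReplayBuffer and environment.
memory = ReplayBuffer(buffer_size=int(1e6), batch_size=128, seed=0)   # assumed signature
agent = Agent(state_size=33, action_size=4, seed=0, memory=memory, batch_size=128,
              lr_actor=1e-4, lr_critic=1e-3, clip_critic=1.0, gamma=0.99, tau=1e-3,
              weight_decay=0.0, update_network_steps=20, sgd_epoch=10,
              checkpoint_prefix='ddpg')
for episode in range(200):
    states = agent_env.reset()                       # hypothetical multi-agent env API
    agent.reset()
    dones = np.zeros(len(states), dtype=bool)
    while not np.any(dones):
        actions, action_probs = agent.act(states)
        next_states, rewards, dones = agent_env.step(actions)   # hypothetical env step
        agent.step(states, actions, action_probs, rewards, next_states, dones)
        states = next_states
    agent.checkpoint()
agent.save_checkpoint()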
Exemplo n.º 33
class ddpg_Agent():
    """Interacts with and learns from the environment."""
    
    def __init__(self, env, config):
        """Initialize an Agent object.
        
        Params
        ======
            env : environment to be handled
            config : configuration given a variety of parameters
        """
        self.env = env
        self.config = config

        # set parameter for ML
        self.set_parameters(config)
        # Q-Network
        self.create_networks()
        # Noise process
        self.noise = OUNoise(self.action_size, self.seed)
        # Replay memory
        self.memory = ReplayBuffer(self.action_size, self.buffer_size, self.batch_size, self.seed)
    
    def set_parameters(self, config):
        # Base agent parameters
        self.gamma = config['gamma']                    # discount factor 
        self.tau = config['tau']
        self.max_episodes = config['max_episodes']      # max number of episodes to train
        self.env_file_name = config['env_file_name']    # name and path for env app
        self.brain_name = config['brain_name']          # name for env brain used in step
        self.num_agents = config['num_agents']
        self.state_size = config['state_size']
        self.action_size = config['action_size']
        self.hidden_size = config['hidden_size']
        self.buffer_size = config['buffer_size']
        self.batch_size = config['batch_size']
        self.dropout = config['dropout']
        self.critic_learning_rate = config['critic_learning_rate']
        self.actor_learning_rate = config['actor_learning_rate']
        self.seed = config['seed']
        self.noise_scale = 1
        self.noise_sigma = 0.1
        # Some debug flags
        self.DoDebugEpisodeLists = False        
        
    def create_networks(self):
        # Actor Network (local & Target Network)
        self.actor_local = Actor(self.state_size, self.hidden_size, self.action_size, self.seed).to(device)
        self.actor_target = Actor(self.state_size, self.hidden_size, self.action_size, self.seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.actor_learning_rate)

        # Critic Network (local & Target Network)
        self.critic_local = Critic(self.state_size, self.hidden_size, self.action_size, self.seed, self.dropout).to(device)
        self.critic_target = Critic(self.state_size, self.hidden_size, self.action_size, self.seed, self.dropout).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.critic_learning_rate)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        # print('step : Next States : ',next_state.shape)
        self.memory.add(state, action, reward, next_state, done)
        # print('New step added to memory, length : ',len(self.memory))

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma)
            
    def update_noise_scale(self, cur_reward, scale_min = 0.2, scale_noise=False):
        """ If scale_noise == True the self.noise_scale will be decreased in relation to rewards
            Currently hand coded  as rewards go up noise_scale will go down from 1 to scale_min"""
        
        if scale_noise:
            rewlow = 2   # below rewlow the noise_scale stays at 1; from there it decreases linearly to scale_min + 0.5*(1 - scale_min) until rewhigh is reached
            rewhigh = 10 # above rewhigh the noise_scale falls linearly to scale_min until a reward of 30 is reached; beyond 30 it stays at scale_min
            if cur_reward > rewlow:
                if cur_reward < rewhigh:
                    self.noise_scale = (1 - scale_min)*(0.5*(rewhigh-cur_reward)/(rewhigh - rewlow) + 0.5) + scale_min
                else:
                    self.noise_scale = (1 - scale_min)*max(0.5*(30 - cur_reward)/(30 - rewhigh), 0) + scale_min
                    
            print('Updated noise scale to : ',self.noise_scale)
                
        return                    
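        # Worked example of the schedule above (scale_min = 0.2 assumed): at an average
        # reward of 6 the scale is 0.8*(0.5*(10-6)/(10-2) + 0.5) + 0.2 = 0.8; at a reward
        # of 20 it is 0.8*max(0.5*(30-20)/(30-10), 0) + 0.2 = 0.4; at 30 and above it
        # bottoms out at scale_min = 0.2.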
        

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = ten(state)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise_scale * self.noise.sample()
            # ToDo check if tanh works better
        return np.clip(action, -1, 1)

    def train(self):
        if False:  # set to True to warm-start from a previously saved checkpoint
            filename = 'trained_reacher_a_e100.pth'
            self.load_agent(filename)
        all_rewards = []
        reward_window = deque(maxlen=100)
        print('Running on device : ',device)
        for i_episode in range(self.max_episodes): 
            tic = time.time()
            # Reset the environment
            env_info = self.env.reset(train_mode=True)[self.brain_name]
            state = env_info.vector_observations
            total_reward = np.zeros(self.num_agents)
            t = 0
            done = np.zeros(self.num_agents, dtype = bool)

            # loop over episode time steps
            while not np.any(done):
                # act and collect data
                action = self.act(state)
                env_info = self.env.step(action)[self.brain_name]
                next_state = env_info.vector_observations
                reward = np.asarray(env_info.rewards)
                done = np.asarray(env_info.local_done)
                # np.set_printoptions(formatter={'float': '{: 0.3f}'.format})
                # print('Episode {} step {} taken action {} reward {} and done is {}'.format(i_episode,t,action,reward,done))
                # increment stuff
                t += 1
                total_reward += reward
                # Proceed agent step
                self.step(state, action, reward, next_state, done)
                # prepare for next round
                state = next_state
            # while not done
            # keep track of rewards:
            all_rewards.append(np.mean(total_reward))
            reward_window.append(np.mean(total_reward))
            
            # Output Episode info : 
            toc = time.time()
            if (i_episode == 100):
                self.stable_update()
            self.update_noise_scale(np.mean(reward_window))
            if not (i_episode % 25 == 0):
                print('Episode {} || Total Reward : {:6.3f} || average reward : {:6.3f} || Used {:5.3f} seconds, mem : {}'.format(i_episode,np.mean(total_reward),np.mean(reward_window),toc-tic,len(self.memory)))
            else:
                print(Back.RED + 'Episode {} || Total Reward : {:6.3f} || average reward : {:6.3f}'.format(i_episode,np.mean(total_reward),np.mean(reward_window)))
                print(Style.RESET_ALL)
                
            if (i_episode % 50 == 0):
                self.save_agent(i_episode)
        # for i_episode
            
        return all_rewards

    def reset(self):
        self.noise.reset()
        
    def stable_update(self):
        """ Update Hyperparameters which proved more stable """
        self.buffer_size = 400000
        self.memory.enlarge(self.buffer_size)
        self.noise_sigma = 0.05
        self.noise.sigma = 0.05

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        # print('learn : Next States : ',next_states.shape)
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # print('learn : Actions : ',actions_next.shape)
        # print('learn : Q_target_next : ',Q_targets_next.shape)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
            
    def save_agent(self,i_episode):
        filename = 'trained_reacher_e'+str(i_episode)+'.pth'
        torch.save({
            'critic_local': self.critic_local.state_dict(),
            'critic_target': self.critic_target.state_dict(),
            'actor_local': self.actor_local.state_dict(),
            'actor_target': self.actor_target.state_dict(),
            }, filename)
        
        print('Saved Networks in ',filename)
        return
        
    def load_agent(self,filename):
        savedata = torch.load(filename)
        self.critic_local.load_state_dict(savedata['critic_local'])
        self.critic_target.load_state_dict(savedata['critic_target'])
        self.actor_local.load_state_dict(savedata['actor_local'])
        self.actor_target.load_state_dict(savedata['actor_target'])
        return
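# Usage sketch (not part of the original listing): how ddpg_Agent might be driven. The
# config keys mirror those read in set_parameters; the numeric values, the brain name and
# the UnityEnvironment loader (from the unityagents package used by Reacher-style
# projects) are assumptions for illustration only.
from unityagents import UnityEnvironment

config = {
    'gamma': 0.99, 'tau': 1e-3, 'max_episodes': 300,
    'env_file_name': 'Reacher.app', 'brain_name': 'ReacherBrain',
    'num_agents': 20, 'state_size': 33, 'action_size': 4,
    'hidden_size': 256, 'buffer_size': 200000, 'batch_size': 128,
    'dropout': 0.1, 'critic_learning_rate': 1e-3, 'actor_learning_rate': 1e-4,
    'seed': 0,
}
env = UnityEnvironment(file_name=config['env_file_name'])
agent = ddpg_Agent(env, config)
all_rewards = agent.train()    # returns the per-episode mean rewards collected in train()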
Exemplo n.º 34
class DDPG(object):
    def __init__(self, nb_status, nb_actions, args, writer):
        self.clip_actor_grad = args.clip_actor_grad
        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.discrete = args.discrete
        self.pic = args.pic
        self.writer = writer
        self.select_time = 0
        if self.pic:
            self.nb_status = args.pic_status

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'use_bn': args.bn,
            'init_method': args.init_method
        }
        if args.pic:
            self.cnn = CNN(1, args.pic_status)
            self.cnn_target = CNN(1, args.pic_status)
            self.cnn_optim = Adam(self.cnn.parameters(), lr=args.crate)
        self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target,
                    self.actor)  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)
        if args.pic:
            hard_update(self.cnn_target, self.cnn)

        #Create replay buffer
        self.memory = rpm(
            args.rmsize
        )  # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        #
        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.use_cuda = args.cuda
        #
        if self.use_cuda: self.cuda()

    def normalize(self, pic):
        pic = pic.swapaxes(0, 2).swapaxes(1, 2)
        return pic

    def update_policy(self):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # Prepare for the target q batch
        if self.pic:
            state_batch = np.array([self.normalize(x) for x in state_batch])
            state_batch = to_tensor(state_batch, volatile=True)
            state_batch = self.cnn(state_batch)
            next_state_batch = np.array(
                [self.normalize(x) for x in next_state_batch])
            next_state_batch = to_tensor(next_state_batch, volatile=True)
            next_state_batch = self.cnn_target(next_state_batch)
            next_q_values = self.critic_target(
                [next_state_batch,
                 self.actor_target(next_state_batch)])
        else:
            next_q_values = self.critic_target([
                to_tensor(next_state_batch, volatile=True),
                self.actor_target(to_tensor(next_state_batch, volatile=True)),
            ])
        # print('batch of picture is ok')
        next_q_values.volatile = False

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor((1 - terminal_batch.astype(np.float32))) * next_q_values

        # Critic update
        self.critic.zero_grad()
        if self.pic: self.cnn.zero_grad()

        if self.pic:
            state_batch.volatile = False
            q_batch = self.critic([state_batch, to_tensor(action_batch)])
        else:
            q_batch = self.critic(
                [to_tensor(state_batch),
                 to_tensor(action_batch)])

        # print(reward_batch, next_q_values*self.discount, target_q_batch, terminal_batch.astype(np.float))
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()
        if self.pic: self.cnn_optim.step()

        self.actor.zero_grad()
        if self.pic: self.cnn.zero_grad()

        if self.pic:
            state_batch.volatile = False
            policy_loss = -self.critic([state_batch, self.actor(state_batch)])
        else:
            policy_loss = -self.critic(
                [to_tensor(state_batch),
                 self.actor(to_tensor(state_batch))])

        policy_loss = policy_loss.mean()
        policy_loss.backward()

        if self.clip_actor_grad is not None:
            torch.nn.utils.clip_grad_norm_(self.actor.parameters(),
                                           float(self.clip_actor_grad))

            if self.writer is not None:
                mean_policy_grad = np.array(
                    np.mean([
                        np.linalg.norm(p.grad.data.cpu().numpy().ravel())
                        for p in self.actor.parameters()
                    ]))
                #print(mean_policy_grad)
                self.writer.add_scalar('train/mean_policy_grad',
                                       mean_policy_grad, self.select_time)

        self.actor_optim.step()
        if self.pic: self.cnn_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)
        if self.pic:
            soft_update(self.cnn_target, self.cnn, self.tau)

        return -policy_loss, value_loss

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()
        if (self.pic):
            self.cnn.eval()
            self.cnn_target.eval()

    def train(self):
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()
        if (self.pic):
            self.cnn.train()
            self.cnn_target.train()

    def cuda(self):
        if self.pic:  # the CNN encoder only exists when picture input is enabled
            self.cnn.cuda()
            self.cnn_target.cuda()
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self, fix=False):
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        if self.discrete and not fix:
            action = action.argmax()
        if self.pic:
            action = np.concatenate(
                (softmax(action[:16]), softmax(action[16:])))
        return action

    def select_action(self,
                      s_t,
                      decay_epsilon=True,
                      return_fix=False,
                      noise_level=0):
        self.eval()
        if self.pic:
            s_t = self.normalize(s_t)
            s_t = self.cnn(to_tensor(np.array([s_t])))
        if self.pic:
            action = to_numpy(self.actor_target(s_t)).squeeze(0)
        else:
            action = to_numpy(self.actor(to_tensor(np.array([s_t
                                                             ])))).squeeze(0)
        self.train()
        noise_level = noise_level * max(self.epsilon, 0)

        if np.random.uniform(0, 1) < noise_level:
            action = (action +
                      self.random_action(fix=True)) / 2.  # epsilon greedy
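        # Note on exploration: noise_level is scaled by max(self.epsilon, 0); epsilon starts
        # at 1.0 and is reduced by depsilon = 1/args.epsilon per call (below), so after
        # roughly args.epsilon calls to select_action the random-action blend is disabled.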

        if decay_epsilon:
            self.epsilon -= self.depsilon
        self.a_t = action

        if return_fix:
            return action
        if self.discrete:
            return action.argmax()
        else:
            return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=1):
        if output is None: return
        self.actor.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num)))
        self.actor_target.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num)))
        self.critic.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num)))
        self.critic_target.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num)))

    def save_model(self, output, num):
        if self.use_cuda:
            if self.pic:  # the CNN encoder only exists when picture input is enabled
                self.cnn.cpu()
            self.actor.cpu()
            self.critic.cpu()
        torch.save(self.actor.state_dict(),
                   '{}/actor{}.pkl'.format(output, num))
        torch.save(self.critic.state_dict(),
                   '{}/critic{}.pkl'.format(output, num))
        if self.use_cuda:
            if self.pic:
                self.cnn.cuda()
            self.actor.cuda()
            self.critic.cuda()
Exemplo n.º 35
def main():
    env = gym.make(args.env_name)
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    running_state = ZFilter((num_inputs,), clip=5)

    print('state size:', num_inputs) 
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate, 
                              weight_decay=args.l2_rate)

    writer = SummaryWriter(comment="-ppo_iter-" + str(args.max_iter_num))
    
    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

    
    episodes = 0    

    for iter in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []

        while steps < args.total_sample_size: 
            state = env.reset()
            score = 0

            state = running_state(state)
            
            for _ in range(10000): 
                if args.render:
                    env.render()

                steps += 1

                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                action = get_action(mu, std)[0]
                next_state, reward, done, _ = env.step(action)

                if done:
                    mask = 0
                else:
                    mask = 1

                memory.append([state, action, reward, mask])

                next_state = running_state(next_state)
                state = next_state

                score += reward

                if done:
                    break
            
            episodes += 1
            scores.append(score)
        
        score_avg = np.mean(scores)
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)

        actor.train(), critic.train()
        train_model(actor, critic, memory, actor_optim, critic_optim, args)

        if iter % 100 == 0:
            score_avg = int(score_avg)

            model_path = os.path.join(os.getcwd(),'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(model_path, 'ckpt_'+ str(score_avg)+'.pth.tar')

            save_checkpoint({
                'actor': actor.state_dict(),
                'critic': critic.state_dict(),
                'z_filter_n':running_state.rs.n,
                'z_filter_m': running_state.rs.mean,
                'z_filter_s': running_state.rs.sum_square,
                'args': args,
                'score': score_avg
            }, filename=ckpt_path)