Example #1
    def __init__(self, writer, device, state_dim, action_dim, args, noise):
        super(DDPG, self).__init__()
        self.device = device
        self.writer = writer

        self.args = args
        self.actor = Actor(self.args.layer_num, state_dim, action_dim, self.args.hidden_dim, \
                           self.args.activation_function, self.args.last_activation, self.args.trainable_std)

        self.target_actor = Actor(self.args.layer_num, state_dim, action_dim, self.args.hidden_dim, \
                           self.args.activation_function, self.args.last_activation, self.args.trainable_std)

        self.q = Critic(self.args.layer_num, state_dim + action_dim, 1,
                        self.args.hidden_dim, self.args.activation_function,
                        None)

        self.target_q = Critic(self.args.layer_num, state_dim + action_dim, 1,
                               self.args.hidden_dim,
                               self.args.activation_function, None)

        self.soft_update(self.q, self.target_q, 1.)
        self.soft_update(self.actor, self.target_actor, 1.)

        self.q_optimizer = optim.Adam(self.q.parameters(), lr=self.args.q_lr)

        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=self.args.actor_lr)
        self.data = ReplayBuffer(action_prob_exist=False,
                                 max_size=int(self.args.memory_size),
                                 state_dim=state_dim,
                                 num_action=action_dim)

        self.noise = noise
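
This constructor reads every hyperparameter from an `args` object. A minimal sketch of the fields it (and the matching train_net in Example #6) touches, built with types.SimpleNamespace; only the field names come from the snippet, every value below is an assumption:

import torch
from types import SimpleNamespace

# Hypothetical DDPG hyperparameters; values are guesses, names are from the code.
ddpg_args = SimpleNamespace(
    layer_num=3,                      # hidden layers in Actor / Critic
    hidden_dim=256,                   # hidden layer width
    activation_function=torch.relu,   # may be a string in the real Actor/Critic; assumed callable
    last_activation=torch.tanh,       # squash actions
    trainable_std=False,              # deterministic policy head
    q_lr=1e-3,
    actor_lr=1e-4,
    memory_size=1e5,                  # replay capacity, cast to int in __init__
    gamma=0.99,                       # used by train_net (Example #6)
    soft_update_rate=0.005,           # used by train_net (Example #6)
)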
Example #2
    def __init__(self, writer, device, state_dim, action_dim, args):
        super(PPO,self).__init__()
        self.args = args
        
        self.data = ReplayBuffer(action_prob_exist=True,
                                 max_size=self.args.traj_length,
                                 state_dim=state_dim,
                                 num_action=action_dim)
        self.actor = Actor(self.args.layer_num, state_dim, action_dim, self.args.hidden_dim, \
                           self.args.activation_function,self.args.last_activation,self.args.trainable_std)
        self.critic = Critic(self.args.layer_num, state_dim, 1, \
                             self.args.hidden_dim, self.args.activation_function,self.args.last_activation)
        
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=self.args.actor_lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=self.args.critic_lr)

        self.writer = writer
        self.device = device
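
As with the other agents, everything comes from `args`. A sketch of the fields this PPO constructor and its train_net/get_gae (Example #7) expect; names are taken from the code, all values are assumptions:

import torch
from types import SimpleNamespace

# Hypothetical PPO hyperparameters.
ppo_args = SimpleNamespace(
    traj_length=2048,                 # on-policy rollout length = buffer capacity
    layer_num=3, hidden_dim=64,
    activation_function=torch.tanh,   # assumed callable
    last_activation=None,
    trainable_std=True,               # PPO learns the Gaussian std
    actor_lr=3e-4, critic_lr=3e-4,
    gamma=0.99, lambda_=0.95,         # GAE parameters
    train_epoch=10, batch_size=64,
    max_clip=0.2,                     # PPO clipping range
    entropy_coef=1e-2, critic_coef=0.5,
    max_grad_norm=0.5,
)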
Example #3
    def __init__(self, writer, device, state_dim, action_dim, args):
        super(SAC, self).__init__()
        self.args = args
        self.actor = Actor(self.args.layer_num, state_dim, action_dim, self.args.hidden_dim, \
                           self.args.activation_function, self.args.last_activation, self.args.trainable_std)

        self.q_1 = Critic(self.args.layer_num, state_dim + action_dim, 1,
                          self.args.hidden_dim, self.args.activation_function,
                          self.args.last_activation)
        self.q_2 = Critic(self.args.layer_num, state_dim + action_dim, 1,
                          self.args.hidden_dim, self.args.activation_function,
                          self.args.last_activation)

        self.target_q_1 = Critic(self.args.layer_num, state_dim + action_dim,
                                 1, self.args.hidden_dim,
                                 self.args.activation_function,
                                 self.args.last_activation)
        self.target_q_2 = Critic(self.args.layer_num, state_dim + action_dim,
                                 1, self.args.hidden_dim,
                                 self.args.activation_function,
                                 self.args.last_activation)

        self.soft_update(self.q_1, self.target_q_1, 1.)
        self.soft_update(self.q_2, self.target_q_2, 1.)

        self.alpha = nn.Parameter(torch.tensor(self.args.alpha_init))

        self.data = ReplayBuffer(action_prob_exist=False,
                                 max_size=int(self.args.memory_size),
                                 state_dim=state_dim,
                                 num_action=action_dim)
        self.target_entropy = -torch.tensor(action_dim)

        self.q_1_optimizer = optim.Adam(self.q_1.parameters(),
                                        lr=self.args.q_lr)
        self.q_2_optimizer = optim.Adam(self.q_2.parameters(),
                                        lr=self.args.q_lr)

        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=self.args.actor_lr)
        self.alpha_optimizer = optim.Adam([self.alpha], lr=self.args.alpha_lr)

        self.device = device
        self.writer = writer
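
The `soft_update(..., 1.)` calls in this constructor are simply hard copies: with rate 1 the target network receives 100% of the online weights. A tiny self-contained check of that behaviour, using toy nn.Linear modules rather than the real Critic:

import torch
import torch.nn as nn

def soft_update(network, target_network, rate):
    # same update rule as the soft_update method used throughout these examples
    for p, tp in zip(network.parameters(), target_network.parameters()):
        tp.data.copy_(tp.data * (1.0 - rate) + p.data * rate)

q, target_q = nn.Linear(4, 1), nn.Linear(4, 1)
soft_update(q, target_q, 1.)   # rate=1.0 -> target becomes an exact copy
assert all(torch.equal(p, tp) for p, tp in zip(q.parameters(), target_q.parameters()))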
Example #4
class SAC(nn.Module):
    def __init__(self, writer, device, state_dim, action_dim, args):
        super(SAC, self).__init__()
        self.args = args
        self.actor = Actor(self.args.layer_num, state_dim, action_dim, self.args.hidden_dim, \
                           self.args.activation_function, self.args.last_activation, self.args.trainable_std)

        self.q_1 = Critic(self.args.layer_num, state_dim + action_dim, 1,
                          self.args.hidden_dim, self.args.activation_function,
                          self.args.last_activation)
        self.q_2 = Critic(self.args.layer_num, state_dim + action_dim, 1,
                          self.args.hidden_dim, self.args.activation_function,
                          self.args.last_activation)

        self.target_q_1 = Critic(self.args.layer_num, state_dim + action_dim,
                                 1, self.args.hidden_dim,
                                 self.args.activation_function,
                                 self.args.last_activation)
        self.target_q_2 = Critic(self.args.layer_num, state_dim + action_dim,
                                 1, self.args.hidden_dim,
                                 self.args.activation_function,
                                 self.args.last_activation)

        self.soft_update(self.q_1, self.target_q_1, 1.)
        self.soft_update(self.q_2, self.target_q_2, 1.)

        self.alpha = nn.Parameter(torch.tensor(self.args.alpha_init))

        self.data = ReplayBuffer(action_prob_exist=False,
                                 max_size=int(self.args.memory_size),
                                 state_dim=state_dim,
                                 num_action=action_dim)
        self.target_entropy = -torch.tensor(action_dim)

        self.q_1_optimizer = optim.Adam(self.q_1.parameters(),
                                        lr=self.args.q_lr)
        self.q_2_optimizer = optim.Adam(self.q_2.parameters(),
                                        lr=self.args.q_lr)

        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=self.args.actor_lr)
        self.alpha_optimizer = optim.Adam([self.alpha], lr=self.args.alpha_lr)

        self.device = device
        self.writer = writer

    def put_data(self, transition):
        self.data.put_data(transition)

    def soft_update(self, network, target_network, rate):
        for network_params, target_network_params in zip(
                network.parameters(), target_network.parameters()):
            target_network_params.data.copy_(target_network_params.data *
                                             (1.0 - rate) +
                                             network_params.data * rate)

    def get_action(self, state):
        mu, std = self.actor(state)
        dist = Normal(mu, std)
        u = dist.rsample()
        u_log_prob = dist.log_prob(u)
        a = torch.tanh(u)
        a_log_prob = u_log_prob - torch.log(1 - torch.square(a) + 1e-3)
        return a, a_log_prob.sum(-1, keepdim=True)

    def q_update(self, Q, q_optimizer, states, actions, rewards, next_states,
                 dones):
        ###target
        with torch.no_grad():
            next_actions, next_action_log_prob = self.get_action(next_states)
            q_1 = self.target_q_1(next_states, next_actions)
            q_2 = self.target_q_2(next_states, next_actions)
            q = torch.min(q_1, q_2)
            v = (1 - dones) * (q - self.alpha * next_action_log_prob)
            targets = rewards + self.args.gamma * v

        q = Q(states, actions)
        loss = F.smooth_l1_loss(q, targets)
        q_optimizer.zero_grad()
        loss.backward()
        q_optimizer.step()
        return loss

    def actor_update(self, states):
        now_actions, now_action_log_prob = self.get_action(states)
        q_1 = self.q_1(states, now_actions)
        q_2 = self.q_2(states, now_actions)
        q = torch.min(q_1, q_2)

        loss = (self.alpha.detach() * now_action_log_prob - q).mean()
        self.actor_optimizer.zero_grad()
        loss.backward()
        self.actor_optimizer.step()
        return loss, now_action_log_prob

    def alpha_update(self, now_action_log_prob):
        loss = (-self.alpha *
                (now_action_log_prob + self.target_entropy).detach()).mean()
        self.alpha_optimizer.zero_grad()
        loss.backward()
        self.alpha_optimizer.step()
        return loss

    def train_net(self, batch_size, n_epi):
        data = self.data.sample(shuffle=True, batch_size=batch_size)
        states, actions, rewards, next_states, dones = convert_to_tensor(
            self.device, data['state'], data['action'], data['reward'],
            data['next_state'], data['done'])

        ###q update
        q_1_loss = self.q_update(self.q_1, self.q_1_optimizer, states, actions,
                                 rewards, next_states, dones)
        q_2_loss = self.q_update(self.q_2, self.q_2_optimizer, states, actions,
                                 rewards, next_states, dones)

        ### actor update
        actor_loss, prob = self.actor_update(states)

        ###alpha update
        alpha_loss = self.alpha_update(prob)

        self.soft_update(self.q_1, self.target_q_1, self.args.soft_update_rate)
        self.soft_update(self.q_2, self.target_q_2, self.args.soft_update_rate)
        if self.writer is not None:
            self.writer.add_scalar("loss/q_1", q_1_loss, n_epi)
            self.writer.add_scalar("loss/q_2", q_2_loss, n_epi)
            self.writer.add_scalar("loss/actor", actor_loss, n_epi)
            self.writer.add_scalar("loss/alpha", alpha_loss, n_epi)
Example #5
def train_wasserstein(config):
    # extractor = Extractor(n_flattens=config['n_flattens'], n_hiddens=config['n_hiddens'])
    extractor = InceptionV1(num_classes=32)
    classifier = Classifier(n_flattens=config['n_flattens'],
                            n_hiddens=config['n_hiddens'],
                            n_class=config['n_class'])
    critic = Critic(n_flattens=config['n_flattens'],
                    n_hiddens=config['n_hiddens'])
    if torch.cuda.is_available():
        extractor = extractor.cuda()
        classifier = classifier.cuda()
        critic = critic.cuda()

    triplet_type = config['triplet_type']
    gamma = config['w_gamma']
    weight_wd = config['w_weight']
    weight_triplet = config['t_weight']
    t_margin = config['t_margin']
    t_confidence = config['t_confidence']
    k_critic = 3
    k_clf = 1
    TRIPLET_START_INDEX = 95

    if triplet_type == 'none':
        res_dir = os.path.join(
            config['res_dir'],
            'bs{}-lr{}-w{}-gamma{}'.format(config['batch_size'], config['lr'],
                                           weight_wd, gamma))
        if not os.path.exists(res_dir):
            os.makedirs(res_dir)
        extractor_path = os.path.join(res_dir, "extractor.pth")
        classifier_path = os.path.join(res_dir, "classifier.pth")
        critic_path = os.path.join(res_dir, "critic.pth")
        EPOCH_START = 1
        TEST_INTERVAL = 10

    else:
        TEST_INTERVAL = 1
        w_dir = os.path.join(
            config['res_dir'],
            'bs{}-lr{}-w{}-gamma{}'.format(config['batch_size'], config['lr'],
                                           weight_wd, gamma))
        if not os.path.exists(w_dir):
            os.makedirs(w_dir)
        res_dir = os.path.join(
            w_dir, '{}_t_weight{}_margin{}_confidence{}'.format(
                triplet_type, weight_triplet, t_margin, t_confidence))
        if not os.path.exists(res_dir):
            os.makedirs(res_dir)
        extractor_path = os.path.join(w_dir, "extractor.pth")
        classifier_path = os.path.join(w_dir, "classifier.pth")
        critic_path = os.path.join(w_dir, "critic.pth")

        if os.path.exists(extractor_path):
            extractor.load_state_dict(torch.load(extractor_path))
            classifier.load_state_dict(torch.load(classifier_path))
            critic.load_state_dict(torch.load(critic_path))
            print('load models')
            EPOCH_START = TRIPLET_START_INDEX
        else:
            EPOCH_START = 1

    set_log_config(res_dir)
    print('start epoch {}'.format(EPOCH_START))
    print('triplet type {}'.format(triplet_type))
    print(config)

    logging.debug('train_wt')
    logging.debug(extractor)
    logging.debug(classifier)
    logging.debug(critic)
    logging.debug(config)

    criterion = torch.nn.CrossEntropyLoss()
    softmax_layer = nn.Softmax(dim=1)

    critic_opt = torch.optim.Adam(critic.parameters(), lr=config['lr'])
    classifier_opt = torch.optim.Adam(classifier.parameters(), lr=config['lr'])
    feature_opt = torch.optim.Adam(extractor.parameters(),
                                   lr=config['lr'] / 10)

    def train(extractor, classifier, critic, config, epoch):
        extractor.train()
        classifier.train()
        critic.train()

        iter_source = iter(config['source_train_loader'])
        iter_target = iter(config['target_train_loader'])
        len_source_loader = len(config['source_train_loader'])
        len_target_loader = len(config['target_train_loader'])
        num_iter = len_source_loader
        for step in range(1, num_iter):
            # Python 3: use next(iterator) instead of the removed iterator.next()
            data_source, label_source = next(iter_source)
            data_target, _ = next(iter_target)
            if step % len_target_loader == 0:
                iter_target = iter(config['target_train_loader'])
            if torch.cuda.is_available():
                data_source, label_source = data_source.cuda(
                ), label_source.cuda()
                data_target = data_target.cuda()

            # 1. train critic
            set_requires_grad(extractor, requires_grad=False)
            set_requires_grad(classifier, requires_grad=False)
            set_requires_grad(critic, requires_grad=True)
            with torch.no_grad():
                h_s = extractor(data_source)
                h_s = h_s.view(h_s.size(0), -1)
                h_t = extractor(data_target)
                h_t = h_t.view(h_t.size(0), -1)

            for j in range(k_critic):
                gp = gradient_penalty(critic, h_s, h_t)
                critic_s = critic(h_s)
                critic_t = critic(h_t)
                wasserstein_distance = critic_s.mean() - critic_t.mean()
                critic_cost = -wasserstein_distance + gamma * gp

                critic_opt.zero_grad()
                critic_cost.backward()
                critic_opt.step()

                if step == 10 and j == 0:
                    print('EPOCH {}, DISCRIMINATOR: wd {}, gp {}, loss {}'.
                          format(epoch, wasserstein_distance.item(),
                                 (gamma * gp).item(), critic_cost.item()))
                    logging.debug(
                        'EPOCH {}, DISCRIMINATOR: wd {}, gp {}, loss {}'.
                        format(epoch, wasserstein_distance.item(),
                               (gamma * gp).item(), critic_cost.item()))

            # 2. train feature and class_classifier
            set_requires_grad(extractor, requires_grad=True)
            set_requires_grad(classifier, requires_grad=True)
            set_requires_grad(critic, requires_grad=False)
            for _ in range(k_clf):
                h_s = extractor(data_source)
                h_s = h_s.view(h_s.size(0), -1)
                h_t = extractor(data_target)
                h_t = h_t.view(h_t.size(0), -1)

                source_preds = classifier(h_s)
                clf_loss = criterion(source_preds, label_source)
                wasserstein_distance = critic(h_s).mean() - critic(h_t).mean()

                if triplet_type != 'none' and epoch >= TRIPLET_START_INDEX:
                    target_preds = classifier(h_t)
                    target_labels = target_preds.data.max(1)[1]
                    target_logits = softmax_layer(target_preds)
                    if triplet_type == 'all':
                        triplet_index = np.where(
                            target_logits.data.max(1)[0].cpu().numpy() >
                            t_margin)[0]
                        images = torch.cat((h_s, h_t[triplet_index]), 0)
                        labels = torch.cat(
                            (label_source, target_labels[triplet_index]), 0)
                    elif triplet_type == 'src':
                        images = h_s
                        labels = label_source
                    elif triplet_type == 'tgt':
                        triplet_index = np.where(
                            target_logits.data.max(1)[0].cpu().numpy() >
                            t_confidence)[0]
                        images = h_t[triplet_index]
                        labels = target_labels[triplet_index]
                    elif triplet_type == 'sep':
                        triplet_index = np.where(
                            target_logits.data.max(1)[0].cpu().numpy() >
                            t_confidence)[0]
                        images = h_t[triplet_index]
                        labels = target_labels[triplet_index]
                        t_loss_sep, _ = triplet_loss(extractor, {
                            "X": images,
                            "y": labels
                        }, t_confidence)
                        images = h_s
                        labels = label_source

                    t_loss, _ = triplet_loss(extractor, {
                        "X": images,
                        "y": labels
                    }, t_margin)
                    loss = clf_loss + \
                        weight_wd * wasserstein_distance + \
                        weight_triplet * t_loss
                    if triplet_type == 'sep':
                        loss += t_loss_sep
                    feature_opt.zero_grad()
                    classifier_opt.zero_grad()
                    loss.backward()
                    feature_opt.step()
                    classifier_opt.step()

                    if step == 10:
                        print(
                            'EPOCH {}, CLASSIFIER: clf_loss {}, wd {}, t_loss {}, total loss {}'
                            .format(epoch, clf_loss.item(),
                                    weight_wd * wasserstein_distance.item(),
                                    weight_triplet * t_loss.item(),
                                    loss.item()))
                        logging.debug(
                            'EPOCH {}, CLASSIFIER: clf_loss {}, wd {}, t_loss {}, total loss {}'
                            .format(epoch, clf_loss.item(),
                                    weight_wd * wasserstein_distance.item(),
                                    weight_triplet * t_loss.item(),
                                    loss.item()))

                else:
                    loss = clf_loss + weight_wd * wasserstein_distance
                    feature_opt.zero_grad()
                    classifier_opt.zero_grad()
                    loss.backward()
                    feature_opt.step()
                    classifier_opt.step()

                    if step == 10:
                        print(
                            'EPOCH {}, CLASSIFIER: clf_loss {}, wd {},  loss {}'
                            .format(epoch, clf_loss.item(),
                                    weight_wd * wasserstein_distance.item(),
                                    loss.item()))
                        logging.debug(
                            'EPOCH {}, CLASSIFIER: clf_loss {}, wd {},  loss {}'
                            .format(epoch, clf_loss.item(),
                                    weight_wd * wasserstein_distance.item(),
                                    loss.item()))

    # pretrain(model, config, pretrain_epochs=20)
    for epoch in range(EPOCH_START, config['n_epochs'] + 1):
        train(extractor, classifier, critic, config, epoch)
        if epoch % TEST_INTERVAL == 0:
            # print('test on source_test_loader')
            # test(extractor, classifier, config['source_test_loader'], epoch)
            # print('test on target_train_loader')
            # test(model, config['target_train_loader'], epoch)
            print('test on target_test_loader')
            test(extractor, classifier, config['target_test_loader'], epoch)
        if epoch % config['VIS_INTERVAL'] == 0:
            if triplet_type == 'none':
                title = '(a) WDGRL'
            else:
                title = '(b) TLADA'
            draw_confusion_matrix(extractor, classifier,
                                  config['target_test_loader'], res_dir, epoch,
                                  title)
            draw_tsne(extractor,
                      classifier,
                      config['source_train_loader'],
                      config['target_test_loader'],
                      res_dir,
                      epoch,
                      title,
                      separate=True)
            # draw_tsne(extractor, classifier, config['source_test_loader'], config['target_test_loader'], res_dir, epoch, title, separate=False)
    if triplet_type == 'none':
        torch.save(extractor.state_dict(), extractor_path)
        torch.save(classifier.state_dict(), classifier_path)
        torch.save(critic.state_dict(), critic_path)
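
train_wasserstein pulls everything from a single `config` dict. The keys below are exactly the ones the function reads; the values are placeholders, and source_train_loader, target_train_loader and target_test_loader are assumed to be existing PyTorch DataLoaders yielding (data, label) batches.

config = {
    # model / data dimensions
    'n_flattens': 1024, 'n_hiddens': 500, 'n_class': 32,
    # optimisation
    'batch_size': 64, 'lr': 1e-4, 'n_epochs': 200,
    # Wasserstein / triplet weighting
    'w_gamma': 10.0, 'w_weight': 1.0,
    'triplet_type': 'none',          # 'none', 'all', 'src', 'tgt' or 'sep'
    't_weight': 0.1, 't_margin': 0.5, 't_confidence': 0.9,
    # bookkeeping
    'res_dir': './results', 'VIS_INTERVAL': 50,
    # pre-built DataLoaders (placeholders)
    'source_train_loader': source_train_loader,
    'target_train_loader': target_train_loader,
    'target_test_loader': target_test_loader,
}
train_wasserstein(config)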
Example #6
class DDPG(nn.Module):
    def __init__(self, writer, device, state_dim, action_dim, args, noise):
        super(DDPG, self).__init__()
        self.device = device
        self.writer = writer

        self.args = args
        self.actor = Actor(self.args.layer_num, state_dim, action_dim, self.args.hidden_dim, \
                           self.args.activation_function, self.args.last_activation, self.args.trainable_std)

        self.target_actor = Actor(self.args.layer_num, state_dim, action_dim, self.args.hidden_dim, \
                           self.args.activation_function, self.args.last_activation, self.args.trainable_std)

        self.q = Critic(self.args.layer_num, state_dim + action_dim, 1,
                        self.args.hidden_dim, self.args.activation_function,
                        None)

        self.target_q = Critic(self.args.layer_num, state_dim + action_dim, 1,
                               self.args.hidden_dim,
                               self.args.activation_function, None)

        self.soft_update(self.q, self.target_q, 1.)
        self.soft_update(self.actor, self.target_actor, 1.)

        self.q_optimizer = optim.Adam(self.q.parameters(), lr=self.args.q_lr)

        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=self.args.actor_lr)
        self.data = ReplayBuffer(action_prob_exist=False,
                                 max_size=int(self.args.memory_size),
                                 state_dim=state_dim,
                                 num_action=action_dim)

        self.noise = noise

    def soft_update(self, network, target_network, rate):
        for network_params, target_network_params in zip(
                network.parameters(), target_network.parameters()):
            target_network_params.data.copy_(target_network_params.data *
                                             (1.0 - rate) +
                                             network_params.data * rate)

    def get_action(self, x):
        return self.actor(x)[0] + torch.tensor(self.noise.sample()).to(
            self.device), self.actor(x)[1]

    def put_data(self, transition):
        self.data.put_data(transition)

    def train_net(self, batch_size, n_epi):
        data = self.data.sample(shuffle=True, batch_size=batch_size)
        states, actions, rewards, next_states, dones = convert_to_tensor(
            self.device, data['state'], data['action'], data['reward'],
            data['next_state'], data['done'])

        targets = rewards + self.args.gamma * (1 - dones) * self.target_q(
            next_states,
            self.target_actor(next_states)[0])
        q_loss = F.smooth_l1_loss(self.q(states, actions), targets.detach())
        self.q_optimizer.zero_grad()
        q_loss.backward()
        self.q_optimizer.step()

        actor_loss = -self.q(states, self.actor(states)[0]).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.soft_update(self.q, self.target_q, self.args.soft_update_rate)
        self.soft_update(self.actor, self.target_actor,
                         self.args.soft_update_rate)
        if self.writer is not None:
            self.writer.add_scalar("loss/q", q_loss, n_epi)
            self.writer.add_scalar("loss/actor", actor_loss, n_epi)
Example #7
class PPO(nn.Module):
    def __init__(self, writer, device, state_dim, action_dim, args):
        super(PPO,self).__init__()
        self.args = args
        
        self.data = ReplayBuffer(action_prob_exist=True,
                                 max_size=self.args.traj_length,
                                 state_dim=state_dim,
                                 num_action=action_dim)
        self.actor = Actor(self.args.layer_num, state_dim, action_dim, self.args.hidden_dim, \
                           self.args.activation_function,self.args.last_activation,self.args.trainable_std)
        self.critic = Critic(self.args.layer_num, state_dim, 1, \
                             self.args.hidden_dim, self.args.activation_function,self.args.last_activation)
        
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=self.args.actor_lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=self.args.critic_lr)

        self.writer = writer
        self.device = device
        
    def get_action(self,x):
        mu,sigma = self.actor(x)
        return mu,sigma
    
    def v(self,x):
        return self.critic(x)
    
    def put_data(self,transition):
        self.data.put_data(transition)
        
    def get_gae(self, states, rewards, next_states, dones):
        values = self.v(states).detach()
        td_target = rewards + self.args.gamma * self.v(next_states) * (1 - dones)
        delta = td_target - values
        delta = delta.detach().cpu().numpy()
        advantage_lst = []
        advantage = 0.0
        for idx in reversed(range(len(delta))):
            if dones[idx] == 1:
                advantage = 0.0
            advantage = self.args.gamma * self.args.lambda_ * advantage + delta[idx][0]
            advantage_lst.append([advantage])
        advantage_lst.reverse()
        advantages = torch.tensor(advantage_lst, dtype=torch.float).to(self.device)
        return values, advantages
    
    def train_net(self,n_epi):
        data = self.data.sample(shuffle = False)
        states, actions, rewards, next_states, dones, old_log_probs = convert_to_tensor(
            self.device, data['state'], data['action'], data['reward'],
            data['next_state'], data['done'], data['log_prob'])
        
        old_values, advantages = self.get_gae(states, rewards, next_states, dones)
        returns = advantages + old_values
        advantages = (advantages - advantages.mean())/(advantages.std()+1e-3)
        
        for i in range(self.args.train_epoch):
            for state,action,old_log_prob,advantage,return_,old_value \
            in make_mini_batch(self.args.batch_size, states, actions, \
                                           old_log_probs,advantages,returns,old_values): 
                curr_mu,curr_sigma = self.get_action(state)
                value = self.v(state).float()
                curr_dist = torch.distributions.Normal(curr_mu,curr_sigma)
                entropy = curr_dist.entropy() * self.args.entropy_coef
                curr_log_prob = curr_dist.log_prob(action).sum(1,keepdim = True)

                #policy clipping
                ratio = torch.exp(curr_log_prob - old_log_prob.detach())
                surr1 = ratio * advantage
                surr2 = torch.clamp(ratio, 1-self.args.max_clip, 1+self.args.max_clip) * advantage
                actor_loss = (-torch.min(surr1, surr2) - entropy).mean() 
                
                #value clipping (PPO2 technic)
                old_value_clipped = old_value + (value - old_value).clamp(-self.args.max_clip,self.args.max_clip)
                value_loss = (value - return_.detach().float()).pow(2)
                value_loss_clipped = (old_value_clipped - return_.detach().float()).pow(2)
                critic_loss = 0.5 * self.args.critic_coef * torch.max(value_loss,value_loss_clipped).mean()
                
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                nn.utils.clip_grad_norm_(self.actor.parameters(), self.args.max_grad_norm)
                self.actor_optimizer.step()
                
                self.critic_optimizer.zero_grad()
                critic_loss.backward()
                nn.utils.clip_grad_norm_(self.critic.parameters(), self.args.max_grad_norm)
                self.critic_optimizer.step()
                
                if self.writer is not None:
                    self.writer.add_scalar("loss/actor_loss", actor_loss.item(), n_epi)
                    self.writer.add_scalar("loss/critic_loss", critic_loss.item(), n_epi)
Example #8
class DDPGAgent():
    """ DDPG
    This class implements the DDPG algorithm.
    For more information see: https://spinningup.openai.com/en/latest/algorithms/ddpg.html
    """
    def __init__(self,
                 state_size,
                 action_size,
                 fc_layer_sizes,
                 buffer_size=30000,
                 batch_size=128,
                 update_interval=16,
                 num_update_steps=1,
                 noise_std=0.2,
                 noise_reduction=0.998,
                 noise_std_min=0.05,
                 warmup=1e4,
                 tau=0.02,
                 gamma=0.99,
                 lr_actor=2e-4,
                 lr_critic=2e-4,
                 seed=0):
        """ Initialize an DDPG agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            fc_layer_sizes (list of int): Layer size of each FC layer
            buffer_size (int): the size of the replay buffer
            batch_size (int): the size of the batches for network updates
            update_interval (int): number of steps between updates
            num_update_steps (int): number of update steps in a row
            noise_std (float): std of Gaussian noise for adding to action
            noise_reduction (float): factor to reduce noise after each update
            noise_std_min (float): the minimum value of noise_std
            warmup (int): number of environment steps collected before learning starts
            tau (float): soft weight update factor
            gamma (float): discount factor
            lr_actor (float): learning rate for actor
            lr_critic (float): learning rate for critic
            seed (int): random seed
        """
        self.action_size = action_size
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.update_interval = update_interval
        self.num_update_steps = num_update_steps
        self.tau = tau
        self.gamma = gamma
        self.noise_std = noise_std
        self.noise_reduction = noise_reduction
        self.noise_std_min = noise_std_min
        self.warmup = warmup
        self.t = 0

        # seed
        np.random.seed(seed)

        # torch device
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        # add replay buffer
        self.replay_buffer = ReplayBuffer(buffer_size, self.device, seed)

        # define networks, initialize target networks with original networks
        self.actor = Actor(state_size, action_size, fc_layer_sizes,
                           seed=seed).to(self.device)
        self.target_actor = Actor(state_size,
                                  action_size,
                                  fc_layer_sizes,
                                  seed=seed).to(self.device)
        self.critic = Critic(state_size,
                             action_size,
                             fc_layer_sizes,
                             seed=seed).to(self.device)
        self.target_critic = Critic(state_size,
                                    action_size,
                                    fc_layer_sizes,
                                    seed=seed).to(self.device)
        self.hard_updates()

        # define optimizers
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=lr_critic,
                                           weight_decay=0)

    def act(self, state, add_noise=True):
        """ Computes and returns the action to take

        Params
        ======
            state (list of float): current state
        """
        # input state to actor network in eval mode, get action, add Gaussian noise
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        self.actor.eval()
        with torch.no_grad():
            action = self.actor(state).squeeze().cpu().detach().numpy()
        self.actor.train()
        if add_noise:
            action += self.noise_std * np.random.normal(size=self.action_size)
        return action

    def step(self, state, action, reward, next_state, done):
        """ Saves step details and potentially performs network training

        Params
        ======
            state (list of float): current state
            action (list of float): action taken
            reward (float): reward received
            next_state (list of float):  next state
            done (bool): bool whether end of episode reached
        """
        self.replay_buffer.add(state, action, reward, next_state, done)
        self.t += 1
        if self.t >= self.warmup:
            if self.t % self.update_interval == 0:
                if (len(self.replay_buffer) > self.batch_size):
                    self.learn()

    def learn(self):
        """ Performs actor and critic network training """
        for _ in range(self.num_update_steps):
            # sample a random batch of experiences
            states, actions, rewards, next_states, dones = self.replay_buffer.sample(
                self.batch_size)

            # compute Q targets
            actions_next = self.target_actor(next_states)
            q_targets = rewards + self.gamma * \
                (1 - dones) * self.target_critic(next_states, actions_next)
            q_expected = self.critic(states, actions)

            # compute critic loss, update critic
            critic_loss = F.mse_loss(q_expected, q_targets)
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            torch.nn.utils.clip_grad_norm_(self.critic.parameters(),
                                           .5)  # clip gradients
            self.critic_optimizer.step()

            # update actor
            actions_pred = self.actor(states)
            actor_loss = -self.critic(states, actions_pred).mean()
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # update target networks
            self.soft_updates()

        # reduce action sampling noise
        self.noise_std = max(self.noise_std * self.noise_reduction,
                             self.noise_std_min)

    def soft_updates(self):
        """ Performs a soft parameter update for target and original networks """
        for target, source in zip([self.target_actor, self.target_critic],
                                  [self.actor, self.critic]):
            for target_param, param in zip(target.parameters(),
                                           source.parameters()):
                target_param.data.copy_(target_param.data * (1.0 - self.tau) +
                                        param.data * self.tau)

    def hard_updates(self):
        """ Performs a hard parameter update for target and original networks """
        for target, source in zip([self.target_actor, self.target_critic],
                                  [self.actor, self.critic]):
            for target_param, param in zip(target.parameters(),
                                           source.parameters()):
                target_param.data.copy_(param.data)
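
act() and step() are all an environment loop needs from this agent; a minimal hedged driver (the gym environment and the fc_layer_sizes value are assumptions, and no learning happens until `warmup` steps have been collected):

import gym

env = gym.make("Pendulum-v1")                                # hypothetical continuous-control task
agent = DDPGAgent(state_size=env.observation_space.shape[0],
                  action_size=env.action_space.shape[0],
                  fc_layer_sizes=[256, 128])                  # assumed layer widths

for episode in range(500):
    state, done = env.reset(), False                          # classic gym API assumed
    while not done:
        action = agent.act(state)                             # Gaussian-noised action from the actor
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)   # buffers the step; trains when due
        state = next_state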
Example #9
    def __init__(self,
                 state_size,
                 action_size,
                 fc_layer_sizes,
                 buffer_size=30000,
                 batch_size=128,
                 update_interval=16,
                 num_update_steps=1,
                 noise_std=0.2,
                 noise_reduction=0.998,
                 noise_std_min=0.05,
                 warmup=1e4,
                 tau=0.02,
                 gamma=0.99,
                 lr_actor=2e-4,
                 lr_critic=2e-4,
                 seed=0):
        """ Initialize an DDPG agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            fc_layer_sizes (list of int): Layer size of each FC layer
            buffer_size (int): the size of the replay buffer
            batch_size (int): the size of the batches for network updates
            update_interval (int): number of steps between updates
            num_update_steps (int): number of update steps in a row
            noise_std (float): std of Gaussian noise for adding to action
            noise_reduction (float): factor to reduce noise after each update
            noise_std_min (float): the minimum value of noise_std
            warmup (int): number of environment steps collected before learning starts
            tau (float): soft weight update factor
            gamma (float): discount factor
            lr_actor (float): learning rate for actor
            lr_critic (float): learning rate for critic
            seed (int): random seed
        """
        self.action_size = action_size
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.update_interval = update_interval
        self.num_update_steps = num_update_steps
        self.tau = tau
        self.gamma = gamma
        self.noise_std = noise_std
        self.noise_reduction = noise_reduction
        self.noise_std_min = noise_std_min
        self.warmup = warmup
        self.t = 0

        # seed
        np.random.seed(seed)

        # torch device
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        # add replay buffer
        self.replay_buffer = ReplayBuffer(buffer_size, self.device, seed)

        # define networks, initialize target networks with original networks
        self.actor = Actor(state_size, action_size, fc_layer_sizes,
                           seed=seed).to(self.device)
        self.target_actor = Actor(state_size,
                                  action_size,
                                  fc_layer_sizes,
                                  seed=seed).to(self.device)
        self.critic = Critic(state_size,
                             action_size,
                             fc_layer_sizes,
                             seed=seed).to(self.device)
        self.target_critic = Critic(state_size,
                                    action_size,
                                    fc_layer_sizes,
                                    seed=seed).to(self.device)
        self.hard_updates()

        # define optimizers
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=lr_critic,
                                           weight_decay=0)