Example #1
    def __init__(self, writer, device, state_dim, action_dim, args, noise):
        super(DDPG, self).__init__()
        self.device = device
        self.writer = writer

        self.args = args
        self.actor = Actor(self.args.layer_num, state_dim, action_dim, self.args.hidden_dim, \
                           self.args.activation_function, self.args.last_activation, self.args.trainable_std)

        self.target_actor = Actor(self.args.layer_num, state_dim, action_dim, self.args.hidden_dim, \
                           self.args.activation_function, self.args.last_activation, self.args.trainable_std)

        self.q = Critic(self.args.layer_num, state_dim + action_dim, 1,
                        self.args.hidden_dim, self.args.activation_function,
                        None)

        self.target_q = Critic(self.args.layer_num, state_dim + action_dim, 1,
                               self.args.hidden_dim,
                               self.args.activation_function, None)

        self.soft_update(self.q, self.target_q, 1.)
        self.soft_update(self.actor, self.target_actor, 1.)

        self.q_optimizer = optim.Adam(self.q.parameters(), lr=self.args.q_lr)

        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=self.args.actor_lr)
        self.data = ReplayBuffer(action_prob_exist=False,
                                 max_size=int(self.args.memory_size),
                                 state_dim=state_dim,
                                 num_action=action_dim)

        self.noise = noise
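
This constructor reads every hyperparameter from an `args` object. A minimal sketch of the fields it (and the matching train_net in Example #6) touches, built with types.SimpleNamespace; only the field names come from the snippet, every value below is an assumption:

import torch
from types import SimpleNamespace

# Hypothetical DDPG hyperparameters; values are guesses, names are from the code.
ddpg_args = SimpleNamespace(
    layer_num=3,                      # hidden layers in Actor / Critic
    hidden_dim=256,                   # hidden layer width
    activation_function=torch.relu,   # may be a string in the real Actor/Critic; assumed callable
    last_activation=torch.tanh,       # squash actions
    trainable_std=False,              # deterministic policy head
    q_lr=1e-3,
    actor_lr=1e-4,
    memory_size=1e5,                  # replay capacity, cast to int in __init__
    gamma=0.99,                       # used by train_net (Example #6)
    soft_update_rate=0.005,           # used by train_net (Example #6)
)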
Example #2
    def __init__(self, writer, device, state_dim, action_dim, args):
        super(PPO,self).__init__()
        self.args = args
        
        self.data = ReplayBuffer(action_prob_exist=True,
                                 max_size=self.args.traj_length,
                                 state_dim=state_dim,
                                 num_action=action_dim)
        self.actor = Actor(self.args.layer_num, state_dim, action_dim, self.args.hidden_dim, \
                           self.args.activation_function,self.args.last_activation,self.args.trainable_std)
        self.critic = Critic(self.args.layer_num, state_dim, 1, \
                             self.args.hidden_dim, self.args.activation_function,self.args.last_activation)
        
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=self.args.actor_lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=self.args.critic_lr)

        self.writer = writer
        self.device = device
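
As with the other agents, everything comes from `args`. A sketch of the fields this PPO constructor and its train_net/get_gae (Example #7) expect; names are taken from the code, all values are assumptions:

import torch
from types import SimpleNamespace

# Hypothetical PPO hyperparameters.
ppo_args = SimpleNamespace(
    traj_length=2048,                 # on-policy rollout length = buffer capacity
    layer_num=3, hidden_dim=64,
    activation_function=torch.tanh,   # assumed callable
    last_activation=None,
    trainable_std=True,               # PPO learns the Gaussian std
    actor_lr=3e-4, critic_lr=3e-4,
    gamma=0.99, lambda_=0.95,         # GAE parameters
    train_epoch=10, batch_size=64,
    max_clip=0.2,                     # PPO clipping range
    entropy_coef=1e-2, critic_coef=0.5,
    max_grad_norm=0.5,
)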
Example #3
    def __init__(self, writer, device, state_dim, action_dim, args):
        super(SAC, self).__init__()
        self.args = args
        self.actor = Actor(self.args.layer_num, state_dim, action_dim, self.args.hidden_dim, \
                           self.args.activation_function, self.args.last_activation, self.args.trainable_std)

        self.q_1 = Critic(self.args.layer_num, state_dim + action_dim, 1,
                          self.args.hidden_dim, self.args.activation_function,
                          self.args.last_activation)
        self.q_2 = Critic(self.args.layer_num, state_dim + action_dim, 1,
                          self.args.hidden_dim, self.args.activation_function,
                          self.args.last_activation)

        self.target_q_1 = Critic(self.args.layer_num, state_dim + action_dim,
                                 1, self.args.hidden_dim,
                                 self.args.activation_function,
                                 self.args.last_activation)
        self.target_q_2 = Critic(self.args.layer_num, state_dim + action_dim,
                                 1, self.args.hidden_dim,
                                 self.args.activation_function,
                                 self.args.last_activation)

        self.soft_update(self.q_1, self.target_q_1, 1.)
        self.soft_update(self.q_2, self.target_q_2, 1.)

        self.alpha = nn.Parameter(torch.tensor(self.args.alpha_init))

        self.data = ReplayBuffer(action_prob_exist=False,
                                 max_size=int(self.args.memory_size),
                                 state_dim=state_dim,
                                 num_action=action_dim)
        self.target_entropy = -torch.tensor(action_dim)

        self.q_1_optimizer = optim.Adam(self.q_1.parameters(),
                                        lr=self.args.q_lr)
        self.q_2_optimizer = optim.Adam(self.q_2.parameters(),
                                        lr=self.args.q_lr)

        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=self.args.actor_lr)
        self.alpha_optimizer = optim.Adam([self.alpha], lr=self.args.alpha_lr)

        self.device = device
        self.writer = writer
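
The `soft_update(..., 1.)` calls in this constructor are simply hard copies: with rate 1 the target network receives 100% of the online weights. A tiny self-contained check of that behaviour, using toy nn.Linear modules rather than the real Critic:

import torch
import torch.nn as nn

def soft_update(network, target_network, rate):
    # same update rule as the soft_update method used throughout these examples
    for p, tp in zip(network.parameters(), target_network.parameters()):
        tp.data.copy_(tp.data * (1.0 - rate) + p.data * rate)

q, target_q = nn.Linear(4, 1), nn.Linear(4, 1)
soft_update(q, target_q, 1.)   # rate=1.0 -> target becomes an exact copy
assert all(torch.equal(p, tp) for p, tp in zip(q.parameters(), target_q.parameters()))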
Example #4
class SAC(nn.Module):
    def __init__(self, writer, device, state_dim, action_dim, args):
        super(SAC, self).__init__()
        self.args = args
        self.actor = Actor(self.args.layer_num, state_dim, action_dim, self.args.hidden_dim, \
                           self.args.activation_function, self.args.last_activation, self.args.trainable_std)

        self.q_1 = Critic(self.args.layer_num, state_dim + action_dim, 1,
                          self.args.hidden_dim, self.args.activation_function,
                          self.args.last_activation)
        self.q_2 = Critic(self.args.layer_num, state_dim + action_dim, 1,
                          self.args.hidden_dim, self.args.activation_function,
                          self.args.last_activation)

        self.target_q_1 = Critic(self.args.layer_num, state_dim + action_dim,
                                 1, self.args.hidden_dim,
                                 self.args.activation_function,
                                 self.args.last_activation)
        self.target_q_2 = Critic(self.args.layer_num, state_dim + action_dim,
                                 1, self.args.hidden_dim,
                                 self.args.activation_function,
                                 self.args.last_activation)

        self.soft_update(self.q_1, self.target_q_1, 1.)
        self.soft_update(self.q_2, self.target_q_2, 1.)

        self.alpha = nn.Parameter(torch.tensor(self.args.alpha_init))

        self.data = ReplayBuffer(action_prob_exist=False,
                                 max_size=int(self.args.memory_size),
                                 state_dim=state_dim,
                                 num_action=action_dim)
        self.target_entropy = -torch.tensor(action_dim)

        self.q_1_optimizer = optim.Adam(self.q_1.parameters(),
                                        lr=self.args.q_lr)
        self.q_2_optimizer = optim.Adam(self.q_2.parameters(),
                                        lr=self.args.q_lr)

        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=self.args.actor_lr)
        self.alpha_optimizer = optim.Adam([self.alpha], lr=self.args.alpha_lr)

        self.device = device
        self.writer = writer

    def put_data(self, transition):
        self.data.put_data(transition)

    def soft_update(self, network, target_network, rate):
        for network_params, target_network_params in zip(
                network.parameters(), target_network.parameters()):
            target_network_params.data.copy_(target_network_params.data *
                                             (1.0 - rate) +
                                             network_params.data * rate)

    def get_action(self, state):
        mu, std = self.actor(state)
        dist = Normal(mu, std)
        u = dist.rsample()
        u_log_prob = dist.log_prob(u)
        a = torch.tanh(u)
        a_log_prob = u_log_prob - torch.log(1 - torch.square(a) + 1e-3)
        return a, a_log_prob.sum(-1, keepdim=True)

    def q_update(self, Q, q_optimizer, states, actions, rewards, next_states,
                 dones):
        ###target
        with torch.no_grad():
            next_actions, next_action_log_prob = self.get_action(next_states)
            q_1 = self.target_q_1(next_states, next_actions)
            q_2 = self.target_q_2(next_states, next_actions)
            q = torch.min(q_1, q_2)
            v = (1 - dones) * (q - self.alpha * next_action_log_prob)
            targets = rewards + self.args.gamma * v

        q = Q(states, actions)
        loss = F.smooth_l1_loss(q, targets)
        q_optimizer.zero_grad()
        loss.backward()
        q_optimizer.step()
        return loss

    def actor_update(self, states):
        now_actions, now_action_log_prob = self.get_action(states)
        q_1 = self.q_1(states, now_actions)
        q_2 = self.q_2(states, now_actions)
        q = torch.min(q_1, q_2)

        loss = (self.alpha.detach() * now_action_log_prob - q).mean()
        self.actor_optimizer.zero_grad()
        loss.backward()
        self.actor_optimizer.step()
        return loss, now_action_log_prob

    def alpha_update(self, now_action_log_prob):
        loss = (-self.alpha *
                (now_action_log_prob + self.target_entropy).detach()).mean()
        self.alpha_optimizer.zero_grad()
        loss.backward()
        self.alpha_optimizer.step()
        return loss

    def train_net(self, batch_size, n_epi):
        data = self.data.sample(shuffle=True, batch_size=batch_size)
        states, actions, rewards, next_states, dones = convert_to_tensor(
            self.device, data['state'], data['action'], data['reward'],
            data['next_state'], data['done'])

        ###q update
        q_1_loss = self.q_update(self.q_1, self.q_1_optimizer, states, actions,
                                 rewards, next_states, dones)
        q_2_loss = self.q_update(self.q_2, self.q_2_optimizer, states, actions,
                                 rewards, next_states, dones)

        ### actor update
        actor_loss, prob = self.actor_update(states)

        ###alpha update
        alpha_loss = self.alpha_update(prob)

        self.soft_update(self.q_1, self.target_q_1, self.args.soft_update_rate)
        self.soft_update(self.q_2, self.target_q_2, self.args.soft_update_rate)
        if self.writer is not None:
            self.writer.add_scalar("loss/q_1", q_1_loss, n_epi)
            self.writer.add_scalar("loss/q_2", q_2_loss, n_epi)
            self.writer.add_scalar("loss/actor", actor_loss, n_epi)
            self.writer.add_scalar("loss/alpha", alpha_loss, n_epi)
Example #5
def train_wasserstein(config):
    # extractor = Extractor(n_flattens=config['n_flattens'], n_hiddens=config['n_hiddens'])
    extractor = InceptionV1(num_classes=32)
    classifier = Classifier(n_flattens=config['n_flattens'],
                            n_hiddens=config['n_hiddens'],
                            n_class=config['n_class'])
    critic = Critic(n_flattens=config['n_flattens'],
                    n_hiddens=config['n_hiddens'])
    if torch.cuda.is_available():
        extractor = extractor.cuda()
        classifier = classifier.cuda()
        critic = critic.cuda()

    triplet_type = config['triplet_type']
    gamma = config['w_gamma']
    weight_wd = config['w_weight']
    weight_triplet = config['t_weight']
    t_margin = config['t_margin']
    t_confidence = config['t_confidence']
    k_critic = 3
    k_clf = 1
    TRIPLET_START_INDEX = 95

    if triplet_type == 'none':
        res_dir = os.path.join(
            config['res_dir'],
            'bs{}-lr{}-w{}-gamma{}'.format(config['batch_size'], config['lr'],
                                           weight_wd, gamma))
        if not os.path.exists(res_dir):
            os.makedirs(res_dir)
        extractor_path = os.path.join(res_dir, "extractor.pth")
        classifier_path = os.path.join(res_dir, "classifier.pth")
        critic_path = os.path.join(res_dir, "critic.pth")
        EPOCH_START = 1
        TEST_INTERVAL = 10

    else:
        TEST_INTERVAL = 1
        w_dir = os.path.join(
            config['res_dir'],
            'bs{}-lr{}-w{}-gamma{}'.format(config['batch_size'], config['lr'],
                                           weight_wd, gamma))
        if not os.path.exists(w_dir):
            os.makedirs(w_dir)
        res_dir = os.path.join(
            w_dir, '{}_t_weight{}_margin{}_confidence{}'.format(
                triplet_type, weight_triplet, t_margin, t_confidence))
        if not os.path.exists(res_dir):
            os.makedirs(res_dir)
        extractor_path = os.path.join(w_dir, "extractor.pth")
        classifier_path = os.path.join(w_dir, "classifier.pth")
        critic_path = os.path.join(w_dir, "critic.pth")

        if os.path.exists(extractor_path):
            extractor.load_state_dict(torch.load(extractor_path))
            classifier.load_state_dict(torch.load(classifier_path))
            critic.load_state_dict(torch.load(critic_path))
            print('load models')
            EPOCH_START = TRIPLET_START_INDEX
        else:
            EPOCH_START = 1

    set_log_config(res_dir)
    print('start epoch {}'.format(EPOCH_START))
    print('triplet type {}'.format(triplet_type))
    print(config)

    logging.debug('train_wt')
    logging.debug(extractor)
    logging.debug(classifier)
    logging.debug(critic)
    logging.debug(config)

    criterion = torch.nn.CrossEntropyLoss()
    softmax_layer = nn.Softmax(dim=1)

    critic_opt = torch.optim.Adam(critic.parameters(), lr=config['lr'])
    classifier_opt = torch.optim.Adam(classifier.parameters(), lr=config['lr'])
    feature_opt = torch.optim.Adam(extractor.parameters(),
                                   lr=config['lr'] / 10)

    def train(extractor, classifier, critic, config, epoch):
        extractor.train()
        classifier.train()
        critic.train()

        iter_source = iter(config['source_train_loader'])
        iter_target = iter(config['target_train_loader'])
        len_source_loader = len(config['source_train_loader'])
        len_target_loader = len(config['target_train_loader'])
        num_iter = len_source_loader
        for step in range(1, num_iter):
            # Python 3: use next(iterator) instead of the removed iterator.next()
            data_source, label_source = next(iter_source)
            data_target, _ = next(iter_target)
            if step % len_target_loader == 0:
                iter_target = iter(config['target_train_loader'])
            if torch.cuda.is_available():
                data_source, label_source = data_source.cuda(
                ), label_source.cuda()
                data_target = data_target.cuda()

            # 1. train critic
            set_requires_grad(extractor, requires_grad=False)
            set_requires_grad(classifier, requires_grad=False)
            set_requires_grad(critic, requires_grad=True)
            with torch.no_grad():
                h_s = extractor(data_source)
                h_s = h_s.view(h_s.size(0), -1)
                h_t = extractor(data_target)
                h_t = h_t.view(h_t.size(0), -1)

            for j in range(k_critic):
                gp = gradient_penalty(critic, h_s, h_t)
                critic_s = critic(h_s)
                critic_t = critic(h_t)
                wasserstein_distance = critic_s.mean() - critic_t.mean()
                critic_cost = -wasserstein_distance + gamma * gp

                critic_opt.zero_grad()
                critic_cost.backward()
                critic_opt.step()

                if step == 10 and j == 0:
                    print('EPOCH {}, DISCRIMINATOR: wd {}, gp {}, loss {}'.
                          format(epoch, wasserstein_distance.item(),
                                 (gamma * gp).item(), critic_cost.item()))
                    logging.debug(
                        'EPOCH {}, DISCRIMINATOR: wd {}, gp {}, loss {}'.
                        format(epoch, wasserstein_distance.item(),
                               (gamma * gp).item(), critic_cost.item()))

            # 2. train feature and class_classifier
            set_requires_grad(extractor, requires_grad=True)
            set_requires_grad(classifier, requires_grad=True)
            set_requires_grad(critic, requires_grad=False)
            for _ in range(k_clf):
                h_s = extractor(data_source)
                h_s = h_s.view(h_s.size(0), -1)
                h_t = extractor(data_target)
                h_t = h_t.view(h_t.size(0), -1)

                source_preds = classifier(h_s)
                clf_loss = criterion(source_preds, label_source)
                wasserstein_distance = critic(h_s).mean() - critic(h_t).mean()

                if triplet_type != 'none' and epoch >= TRIPLET_START_INDEX:
                    target_preds = classifier(h_t)
                    target_labels = target_preds.data.max(1)[1]
                    target_logits = softmax_layer(target_preds)
                    if triplet_type == 'all':
                        triplet_index = np.where(
                            target_logits.data.max(1)[0].cpu().numpy() >
                            t_margin)[0]
                        images = torch.cat((h_s, h_t[triplet_index]), 0)
                        labels = torch.cat(
                            (label_source, target_labels[triplet_index]), 0)
                    elif triplet_type == 'src':
                        images = h_s
                        labels = label_source
                    elif triplet_type == 'tgt':
                        triplet_index = np.where(
                            target_logits.data.max(1)[0].cpu().numpy() >
                            t_confidence)[0]
                        images = h_t[triplet_index]
                        labels = target_labels[triplet_index]
                    elif triplet_type == 'sep':
                        triplet_index = np.where(
                            target_logits.data.max(1)[0].cpu().numpy() >
                            t_confidence)[0]
                        images = h_t[triplet_index]
                        labels = target_labels[triplet_index]
                        t_loss_sep, _ = triplet_loss(extractor, {
                            "X": images,
                            "y": labels
                        }, t_confidence)
                        images = h_s
                        labels = label_source

                    t_loss, _ = triplet_loss(extractor, {
                        "X": images,
                        "y": labels
                    }, t_margin)
                    loss = clf_loss + \
                        weight_wd * wasserstein_distance + \
                        weight_triplet * t_loss
                    if triplet_type == 'sep':
                        loss += t_loss_sep
                    feature_opt.zero_grad()
                    classifier_opt.zero_grad()
                    loss.backward()
                    feature_opt.step()
                    classifier_opt.step()

                    if step == 10:
                        print(
                            'EPOCH {}, CLASSIFIER: clf_loss {}, wd {}, t_loss {}, total loss {}'
                            .format(epoch, clf_loss.item(),
                                    weight_wd * wasserstein_distance.item(),
                                    weight_triplet * t_loss.item(),
                                    loss.item()))
                        logging.debug(
                            'EPOCH {}, CLASSIFIER: clf_loss {}, wd {}, t_loss {}, total loss {}'
                            .format(epoch, clf_loss.item(),
                                    weight_wd * wasserstein_distance.item(),
                                    weight_triplet * t_loss.item(),
                                    loss.item()))

                else:
                    loss = clf_loss + weight_wd * wasserstein_distance
                    feature_opt.zero_grad()
                    classifier_opt.zero_grad()
                    loss.backward()
                    feature_opt.step()
                    classifier_opt.step()

                    if step == 10:
                        print(
                            'EPOCH {}, CLASSIFIER: clf_loss {}, wd {},  loss {}'
                            .format(epoch, clf_loss.item(),
                                    weight_wd * wasserstein_distance.item(),
                                    loss.item()))
                        logging.debug(
                            'EPOCH {}, CLASSIFIER: clf_loss {}, wd {},  loss {}'
                            .format(epoch, clf_loss.item(),
                                    weight_wd * wasserstein_distance.item(),
                                    loss.item()))

    # pretrain(model, config, pretrain_epochs=20)
    for epoch in range(EPOCH_START, config['n_epochs'] + 1):
        train(extractor, classifier, critic, config, epoch)
        if epoch % TEST_INTERVAL == 0:
            # print('test on source_test_loader')
            # test(extractor, classifier, config['source_test_loader'], epoch)
            # print('test on target_train_loader')
            # test(model, config['target_train_loader'], epoch)
            print('test on target_test_loader')
            test(extractor, classifier, config['target_test_loader'], epoch)
        if epoch % config['VIS_INTERVAL'] == 0:
            if triplet_type == 'none':
                title = '(a) WDGRL'
            else:
                title = '(b) TLADA'
            draw_confusion_matrix(extractor, classifier,
                                  config['target_test_loader'], res_dir, epoch,
                                  title)
            draw_tsne(extractor,
                      classifier,
                      config['source_train_loader'],
                      config['target_test_loader'],
                      res_dir,
                      epoch,
                      title,
                      separate=True)
            # draw_tsne(extractor, classifier, config['source_test_loader'], config['target_test_loader'], res_dir, epoch, title, separate=False)
    if triplet_type == 'none':
        torch.save(extractor.state_dict(), extractor_path)
        torch.save(classifier.state_dict(), classifier_path)
        torch.save(critic.state_dict(), critic_path)
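
train_wasserstein pulls everything from a single `config` dict. The keys below are exactly the ones the function reads; the values are placeholders, and source_train_loader, target_train_loader and target_test_loader are assumed to be existing PyTorch DataLoaders yielding (data, label) batches.

config = {
    # model / data dimensions
    'n_flattens': 1024, 'n_hiddens': 500, 'n_class': 32,
    # optimisation
    'batch_size': 64, 'lr': 1e-4, 'n_epochs': 200,
    # Wasserstein / triplet weighting
    'w_gamma': 10.0, 'w_weight': 1.0,
    'triplet_type': 'none',          # 'none', 'all', 'src', 'tgt' or 'sep'
    't_weight': 0.1, 't_margin': 0.5, 't_confidence': 0.9,
    # bookkeeping
    'res_dir': './results', 'VIS_INTERVAL': 50,
    # pre-built DataLoaders (placeholders)
    'source_train_loader': source_train_loader,
    'target_train_loader': target_train_loader,
    'target_test_loader': target_test_loader,
}
train_wasserstein(config)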
Example #6
class DDPG(nn.Module):
    def __init__(self, writer, device, state_dim, action_dim, args, noise):
        super(DDPG, self).__init__()
        self.device = device
        self.writer = writer

        self.args = args
        self.actor = Actor(self.args.layer_num, state_dim, action_dim, self.args.hidden_dim, \
                           self.args.activation_function, self.args.last_activation, self.args.trainable_std)

        self.target_actor = Actor(self.args.layer_num, state_dim, action_dim, self.args.hidden_dim, \
                           self.args.activation_function, self.args.last_activation, self.args.trainable_std)

        self.q = Critic(self.args.layer_num, state_dim + action_dim, 1,
                        self.args.hidden_dim, self.args.activation_function,
                        None)

        self.target_q = Critic(self.args.layer_num, state_dim + action_dim, 1,
                               self.args.hidden_dim,
                               self.args.activation_function, None)

        self.soft_update(self.q, self.target_q, 1.)
        self.soft_update(self.actor, self.target_actor, 1.)

        self.q_optimizer = optim.Adam(self.q.parameters(), lr=self.args.q_lr)

        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=self.args.actor_lr)
        self.data = ReplayBuffer(action_prob_exist=False,
                                 max_size=int(self.args.memory_size),
                                 state_dim=state_dim,
                                 num_action=action_dim)

        self.noise = noise

    def soft_update(self, network, target_network, rate):
        for network_params, target_network_params in zip(
                network.parameters(), target_network.parameters()):
            target_network_params.data.copy_(target_network_params.data *
                                             (1.0 - rate) +
                                             network_params.data * rate)

    def get_action(self, x):
        return self.actor(x)[0] + torch.tensor(self.noise.sample()).to(
            self.device), self.actor(x)[1]

    def put_data(self, transition):
        self.data.put_data(transition)

    def train_net(self, batch_size, n_epi):
        data = self.data.sample(shuffle=True, batch_size=batch_size)
        states, actions, rewards, next_states, dones = convert_to_tensor(
            self.device, data['state'], data['action'], data['reward'],
            data['next_state'], data['done'])

        targets = rewards + self.args.gamma * (1 - dones) * self.target_q(
            next_states,
            self.target_actor(next_states)[0])
        q_loss = F.smooth_l1_loss(self.q(states, actions), targets.detach())
        self.q_optimizer.zero_grad()
        q_loss.backward()
        self.q_optimizer.step()

        actor_loss = -self.q(states, self.actor(states)[0]).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.soft_update(self.q, self.target_q, self.args.soft_update_rate)
        self.soft_update(self.actor, self.target_actor,
                         self.args.soft_update_rate)
        if self.writer is not None:
            self.writer.add_scalar("loss/q", q_loss, n_epi)
            self.writer.add_scalar("loss/actor", actor_loss, n_epi)
Example #7
class PPO(nn.Module):
    def __init__(self, writer, device, state_dim, action_dim, args):
        super(PPO,self).__init__()
        self.args = args
        
        self.data = ReplayBuffer(action_prob_exist=True,
                                 max_size=self.args.traj_length,
                                 state_dim=state_dim,
                                 num_action=action_dim)
        self.actor = Actor(self.args.layer_num, state_dim, action_dim, self.args.hidden_dim, \
                           self.args.activation_function,self.args.last_activation,self.args.trainable_std)
        self.critic = Critic(self.args.layer_num, state_dim, 1, \
                             self.args.hidden_dim, self.args.activation_function,self.args.last_activation)
        
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=self.args.actor_lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=self.args.critic_lr)

        self.writer = writer
        self.device = device
        
    def get_action(self,x):
        mu,sigma = self.actor(x)
        return mu,sigma
    
    def v(self,x):
        return self.critic(x)
    
    def put_data(self,transition):
        self.data.put_data(transition)
        
    def get_gae(self, states, rewards, next_states, dones):
        values = self.v(states).detach()
        td_target = rewards + self.args.gamma * self.v(next_states) * (1 - dones)
        delta = td_target - values
        delta = delta.detach().cpu().numpy()
        advantage_lst = []
        advantage = 0.0
        for idx in reversed(range(len(delta))):
            if dones[idx] == 1:
                advantage = 0.0
            advantage = self.args.gamma * self.args.lambda_ * advantage + delta[idx][0]
            advantage_lst.append([advantage])
        advantage_lst.reverse()
        advantages = torch.tensor(advantage_lst, dtype=torch.float).to(self.device)
        return values, advantages
    
    def train_net(self,n_epi):
        data = self.data.sample(shuffle = False)
        states, actions, rewards, next_states, dones, old_log_probs = convert_to_tensor(
            self.device, data['state'], data['action'], data['reward'],
            data['next_state'], data['done'], data['log_prob'])
        
        old_values, advantages = self.get_gae(states, rewards, next_states, dones)
        returns = advantages + old_values
        advantages = (advantages - advantages.mean())/(advantages.std()+1e-3)
        
        for i in range(self.args.train_epoch):
            for state,action,old_log_prob,advantage,return_,old_value \
            in make_mini_batch(self.args.batch_size, states, actions, \
                                           old_log_probs,advantages,returns,old_values): 
                curr_mu,curr_sigma = self.get_action(state)
                value = self.v(state).float()
                curr_dist = torch.distributions.Normal(curr_mu,curr_sigma)
                entropy = curr_dist.entropy() * self.args.entropy_coef
                curr_log_prob = curr_dist.log_prob(action).sum(1,keepdim = True)

                #policy clipping
                ratio = torch.exp(curr_log_prob - old_log_prob.detach())
                surr1 = ratio * advantage
                surr2 = torch.clamp(ratio, 1-self.args.max_clip, 1+self.args.max_clip) * advantage
                actor_loss = (-torch.min(surr1, surr2) - entropy).mean() 
                
                #value clipping (PPO2 technic)
                old_value_clipped = old_value + (value - old_value).clamp(-self.args.max_clip,self.args.max_clip)
                value_loss = (value - return_.detach().float()).pow(2)
                value_loss_clipped = (old_value_clipped - return_.detach().float()).pow(2)
                critic_loss = 0.5 * self.args.critic_coef * torch.max(value_loss,value_loss_clipped).mean()
                
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                nn.utils.clip_grad_norm_(self.actor.parameters(), self.args.max_grad_norm)
                self.actor_optimizer.step()
                
                self.critic_optimizer.zero_grad()
                critic_loss.backward()
                nn.utils.clip_grad_norm_(self.critic.parameters(), self.args.max_grad_norm)
                self.critic_optimizer.step()
                
                if self.writer is not None:
                    self.writer.add_scalar("loss/actor_loss", actor_loss.item(), n_epi)
                    self.writer.add_scalar("loss/critic_loss", critic_loss.item(), n_epi)
Example #8
class DDPGAgent():
    """ DDPG
    This class implements the DDPG algorithm.
    For more information see: https://spinningup.openai.com/en/latest/algorithms/ddpg.html
    """
    def __init__(self,
                 state_size,
                 action_size,
                 fc_layer_sizes,
                 buffer_size=30000,
                 batch_size=128,
                 update_interval=16,
                 num_update_steps=1,
                 noise_std=0.2,
                 noise_reduction=0.998,
                 noise_std_min=0.05,
                 warmup=1e4,
                 tau=0.02,
                 gamma=0.99,
                 lr_actor=2e-4,
                 lr_critic=2e-4,
                 seed=0):
        """ Initialize an DDPG agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            fc_layer_sizes (list of int): Layer size of each FC layer
            buffer_size (int): the size of the replay buffer
            batch_size (int): the size of the batches for network updates
            update_interval (int): number of steps between updates
            num_update_steps (int): number of update steps in a row
            noise_std (float): std of Gaussian noise for adding to action
            noise_reduction (float): factor to reduce noise after each update
            noise_std_min (float): the minimum value of noise_std
            warmup (int): number of environment steps collected before learning starts
            tau (float): soft weight update factor
            gamma (float): discount factor
            lr_actor (float): learning rate for actor
            lr_critic (float): learning rate for critic
            seed (int): random seed
        """
        self.action_size = action_size
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.update_interval = update_interval
        self.num_update_steps = num_update_steps
        self.tau = tau
        self.gamma = gamma
        self.noise_std = noise_std
        self.noise_reduction = noise_reduction
        self.noise_std_min = noise_std_min
        self.warmup = warmup
        self.t = 0

        # seed
        np.random.seed(seed)

        # torch device
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        # add replay buffer
        self.replay_buffer = ReplayBuffer(buffer_size, self.device, seed)

        # define networks, initialize target networks with original networks
        self.actor = Actor(state_size, action_size, fc_layer_sizes,
                           seed=seed).to(self.device)
        self.target_actor = Actor(state_size,
                                  action_size,
                                  fc_layer_sizes,
                                  seed=seed).to(self.device)
        self.critic = Critic(state_size,
                             action_size,
                             fc_layer_sizes,
                             seed=seed).to(self.device)
        self.target_critic = Critic(state_size,
                                    action_size,
                                    fc_layer_sizes,
                                    seed=seed).to(self.device)
        self.hard_updates()

        # define optimizers
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=lr_critic,
                                           weight_decay=0)

    def act(self, state, add_noise=True):
        """ Computes and returns the action to take

        Params
        ======
            state (list of float): current state
        """
        # input state to actor network in eval mode, get action, add Gaussian noise
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        self.actor.eval()
        with torch.no_grad():
            action = self.actor(state).squeeze().cpu().detach().numpy()
        self.actor.train()
        if add_noise:
            action += self.noise_std * np.random.normal(size=self.action_size)
        return action

    def step(self, state, action, reward, next_state, done):
        """ Saves step details and potentially performs network training

        Params
        ======
            state (list of float): current state
            action (list of float): action taken
            reward (float): reward received
            next_state (list of float):  next state
            done (bool): bool whether end of episode reached
        """
        self.replay_buffer.add(state, action, reward, next_state, done)
        self.t += 1
        if self.t >= self.warmup:
            if self.t % self.update_interval == 0:
                if (len(self.replay_buffer) > self.batch_size):
                    self.learn()

    def learn(self):
        """ Performs actor and critic network training """
        for _ in range(self.num_update_steps):
            # sample a random batch of experiences
            states, actions, rewards, next_states, dones = self.replay_buffer.sample(
                self.batch_size)

            # compute Q targets
            actions_next = self.target_actor(next_states)
            q_targets = rewards + self.gamma * \
                (1 - dones) * self.target_critic(next_states, actions_next)
            q_expected = self.critic(states, actions)

            # compute critic loss, update critic
            critic_loss = F.mse_loss(q_expected, q_targets)
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            torch.nn.utils.clip_grad_norm_(self.critic.parameters(),
                                           .5)  # clip gradients
            self.critic_optimizer.step()

            # update actor
            actions_pred = self.actor(states)
            actor_loss = -self.critic(states, actions_pred).mean()
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # update target networks
            self.soft_updates()

        # reduce action sampling noise
        self.noise_std = max(self.noise_std * self.noise_reduction,
                             self.noise_std_min)

    def soft_updates(self):
        """ Performs a soft parameter update for target and original networks """
        for target, source in zip([self.target_actor, self.target_critic],
                                  [self.actor, self.critic]):
            for target_param, param in zip(target.parameters(),
                                           source.parameters()):
                target_param.data.copy_(target_param.data * (1.0 - self.tau) +
                                        param.data * self.tau)

    def hard_updates(self):
        """ Performs a hard parameter update for target and original networks """
        for target, source in zip([self.target_actor, self.target_critic],
                                  [self.actor, self.critic]):
            for target_param, param in zip(target.parameters(),
                                           source.parameters()):
                target_param.data.copy_(param.data)
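
act() and step() are all an environment loop needs from this agent; a minimal hedged driver (the gym environment and the fc_layer_sizes value are assumptions, and no learning happens until `warmup` steps have been collected):

import gym

env = gym.make("Pendulum-v1")                                # hypothetical continuous-control task
agent = DDPGAgent(state_size=env.observation_space.shape[0],
                  action_size=env.action_space.shape[0],
                  fc_layer_sizes=[256, 128])                  # assumed layer widths

for episode in range(500):
    state, done = env.reset(), False                          # classic gym API assumed
    while not done:
        action = agent.act(state)                             # Gaussian-noised action from the actor
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)   # buffers the step; trains when due
        state = next_state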
Example #9
    def __init__(self,
                 state_size,
                 action_size,
                 fc_layer_sizes,
                 buffer_size=30000,
                 batch_size=128,
                 update_interval=16,
                 num_update_steps=1,
                 noise_std=0.2,
                 noise_reduction=0.998,
                 noise_std_min=0.05,
                 warmup=1e4,
                 tau=0.02,
                 gamma=0.99,
                 lr_actor=2e-4,
                 lr_critic=2e-4,
                 seed=0):
        """ Initialize an DDPG agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            fc_layer_sizes (list of int): Layer size of each FC layer
            buffer_size (int): the size of the replay buffer
            batch_size (int): the size of the batches for network updates
            update_interval (int): number of steps between updates
            num_update_steps (int): number of update steps in a row
            noise_std (float): std of Gaussian noise for adding to action
            noise_reduction (float): factor to reduce noise after each update
            noise_std_min (float): the minimum value of noise_std
            warmup (int): number of environment steps collected before learning starts
            tau (float): soft weight update factor
            gamma (float): discount factor
            lr_actor (float): learning rate for actor
            lr_critic (float): learning rate for critic
            seed (int): random seed
        """
        self.action_size = action_size
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.update_interval = update_interval
        self.num_update_steps = num_update_steps
        self.tau = tau
        self.gamma = gamma
        self.noise_std = noise_std
        self.noise_reduction = noise_reduction
        self.noise_std_min = noise_std_min
        self.warmup = warmup
        self.t = 0

        # seed
        np.random.seed(seed)

        # torch device
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        # add replay buffer
        self.replay_buffer = ReplayBuffer(buffer_size, self.device, seed)

        # define networks, initialize target networks with original networks
        self.actor = Actor(state_size, action_size, fc_layer_sizes,
                           seed=seed).to(self.device)
        self.target_actor = Actor(state_size,
                                  action_size,
                                  fc_layer_sizes,
                                  seed=seed).to(self.device)
        self.critic = Critic(state_size,
                             action_size,
                             fc_layer_sizes,
                             seed=seed).to(self.device)
        self.target_critic = Critic(state_size,
                                    action_size,
                                    fc_layer_sizes,
                                    seed=seed).to(self.device)
        self.hard_updates()

        # define optimizers
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=lr_critic,
                                           weight_decay=0)