Example #1
    def update(self, lastState, lastAction, reward, state):
        if lastState is None:
            return

        self.initNN(state)
        self._memory.push(
            (torch.FloatTensor([lastState]), torch.LongTensor([[lastAction]]),
             torch.FloatTensor([reward]), torch.FloatTensor([state])))

        if len(self._memory) < self._batchSize:
            return

        transitions = self._memory.sample(self._batchSize)
        batch_state, batch_action, batch_reward, batch_next_state = zip(
            *transitions)

        batch_state = torch.cat(batch_state)
        batch_action = torch.cat(batch_action)
        batch_reward = torch.cat(batch_reward)
        batch_next_state = torch.cat(batch_next_state)
        batch_prediction = self._NN(batch_state)
        batch_next_prediction = self._NN(batch_next_state)

        current_q_values = batch_prediction.gather(1, batch_action)[:, 0]
        max_next_q_values = batch_next_prediction.detach().max(1)[0]

        expected_q_values = (
            1.0 - self._alpha) * current_q_values + self._alpha * (
                batch_reward + self._gamma * max_next_q_values)

        loss = F.smooth_l1_loss(current_q_values, expected_q_values)
        self._optimizer.zero_grad()
        loss.backward()
        self._optimizer.step()
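
The snippet above relies on a `self._memory` object with `push` and `sample` methods that is not shown. A minimal replay-buffer sketch under that assumption (the class name and default capacity here are made up) could look like:

import random
from collections import deque

class ReplayMemory:
    """Fixed-size FIFO buffer of transitions with uniform random sampling."""

    def __init__(self, capacity=10000):
        self._buffer = deque(maxlen=capacity)

    def push(self, transition):
        # transition: a (state, action, reward, next_state) tuple of tensors
        self._buffer.append(transition)

    def sample(self, batch_size):
        return random.sample(self._buffer, batch_size)

    def __len__(self):
        return len(self._buffer)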
Example #2
    def train(self, left_img, right_img, depth_img):
        """

        :param left_img: left rgb camera image
        :param right_img: right rgb camera image
        :param depth_img: reverse depth format.
                          0 means infinite distance,
                          255 (or 1.0) may either be 255 pixels or ~0 cm from both cameras.
                          If it's reversed, the depth mask must be changed.
        :return:
        """
        self.model.train()
        self._adjust_learning_rate(self._train_calls)
        self._train_calls += 1

        # todo: auto convert 0-255 to 0-1

        img_l = torch.FloatTensor(left_img)
        img_r = torch.FloatTensor(right_img)
        true_disparity = torch.FloatTensor(depth_img)

        if torch.cuda.is_available():
            img_l, img_r, true_disparity = img_l.cuda(), img_r.cuda(
            ), true_disparity.cuda()

        finite_depth_mask = true_disparity > 0
        finite_depth_mask.detach_()

        output1, output2, output3 = self.model(img_l, img_r)
        output1 = torch.squeeze(output1, 1)
        output2 = torch.squeeze(output2, 1)
        output3 = torch.squeeze(output3, 1)
        loss = (0.5 * F.smooth_l1_loss(output1[finite_depth_mask],
                                       true_disparity[finite_depth_mask],
                                       reduction='mean') +
                0.7 * F.smooth_l1_loss(output2[finite_depth_mask],
                                       true_disparity[finite_depth_mask],
                                       reduction='mean') +
                F.smooth_l1_loss(output3[finite_depth_mask],
                                 true_disparity[finite_depth_mask],
                                 reduction='mean'))

        self.optimizer.zero_grad()  # clear gradients from the previous step
        loss.backward()
        self.optimizer.step()

        return loss.item()
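
As a standalone illustration of the masked Smooth L1 term used above (shapes and values here are invented), the mask simply drops pixels whose ground-truth disparity is zero, i.e. infinitely far away:

import torch
import torch.nn.functional as F

pred = torch.rand(2, 96, 128)            # predicted disparity, e.g. one of output1..output3
true_disparity = torch.rand(2, 96, 128)
true_disparity[:, :10, :] = 0.0          # pretend these pixels have no valid depth

finite_depth_mask = true_disparity > 0   # keep only pixels with a valid ground truth
loss = F.smooth_l1_loss(pred[finite_depth_mask],
                        true_disparity[finite_depth_mask],
                        reduction='mean')
print(loss.item())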
Example #3
    def update(self, s, a, r, done, s_next):
        s = torch.tensor(s, device=device)
        # s = self.policy_net.preprocess(s)
        a = torch.tensor(a, device=device)
        r = torch.tensor(r, device=device)
        done = torch.tensor(done, device=device)
        s_next = torch.tensor(s_next, device=device)
        # s_next = self.policy_net.preprocess(s_next)

        if not self.ready:
            self.memory.add(Transition(s, a, r, done, s_next))
            return

        # Using batch memory
        self.memory.add(Transition(s, a, r, done, s_next))
        if isinstance(self.memory, WeightedMemory):
            tree_idx, batch, sample_weights = self.memory.sample(self.batch_size)
            sample_weights = torch.tensor(sample_weights, device=device)
        else:
            batch = self.memory.sample(self.batch_size)
        batch_t = Transition(*zip(*batch))  # transposed batch

        # Get expected Q values
        s_batch, a_batch, r_batch, done_batch, s_next_batch = batch_t
        s_batch = torch.cat(s_batch)
        a_batch = torch.stack(a_batch)
        r_batch = torch.stack(r_batch).view(-1, 1)
        s_next_batch = torch.cat(s_next_batch)
        done_batch = torch.stack(done_batch).view(-1, 1)
        q = self.state_action_value(s_batch, a_batch)

        # Get Actual Q values

        double_actions = self.policy_net(s_next_batch).max(1)[1].detach()  # used for double q learning
        q_next = self.state_action_value(s_next_batch, double_actions)

        q_next_actual = (~done_batch) * q_next  # removes elements that are done
        q_target = r_batch + self.gamma * q_next_actual
        ### TEST if clamping works or is even good practice
        q_target = q_target.clamp(-1, 1)
        ### /TEST

        if isinstance(self.memory, WeightedMemory):
            absolute_loss = torch.abs(q - q_target).detach().cpu().numpy()
            loss = weighted_smooth_l1_loss(
                q, q_target, sample_weights
            )  # TODO fix potential non-linearities using huber loss
            self.memory.batch_update(tree_idx, absolute_loss)

        else:
            loss = F.smooth_l1_loss(q, q_target)

        self.optim.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():  # see if this ends up doing anything - should just be relu
            param.grad.data.clamp_(-1, 1)
        self.optim.step()
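
The helpers `WeightedMemory` and `weighted_smooth_l1_loss` belong to this code base and are not shown. A minimal sketch of the loss helper, assuming it is a per-sample Huber loss (same beta threshold as `F.smooth_l1_loss`) scaled by importance-sampling weights, might look like:

import torch

def weighted_smooth_l1_loss(input, target, weights, beta=1.0):
    # Elementwise Huber loss, then scale each sample by its importance weight before averaging.
    diff = torch.abs(input.view(-1) - target.view(-1))
    per_sample = torch.where(diff < beta, 0.5 * diff ** 2 / beta, diff - 0.5 * beta)
    return (weights.view(-1) * per_sample).mean()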
Example #4
    def smooth_l1_loss(coord, label, loc):
        pos_mask = label > 0

        pos_coord = coord[pos_mask]
        pos_loc = loc[pos_mask]

        loss = F.smooth_l1_loss(pos_loc, pos_coord, reduction='sum')

        return loss / pos_mask.sum()
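
A possible call with invented shapes, assuming the method is exposed as a @staticmethod (it takes no self); `coord` holds target box offsets, `loc` the predicted offsets, and `label` marks positive anchors:

import torch

coord = torch.rand(16, 4)                 # ground-truth box offsets
loc = torch.rand(16, 4)                   # predicted box offsets
label = torch.tensor([0, 1, 2, 0] * 4)    # 0 = background, >0 = matched object class

loss = smooth_l1_loss(coord, label, loc)  # summed Smooth L1 averaged over positive anchors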
Example #5
    def _optimize_model(self, model: torch.nn.Module, batch: np.ndarray,
                        gamma: float, device: str):
        """
        Sample batch from memory of environment transitions and train network to fit the
        temporal difference TD(0) Q-value approximation
        """
        model.train()

        # Compute a mask of non-final states and concatenate the batch elements
        non_final_mask = []
        non_final_idx = []
        non_final_next_states = []

        batch_size = len(batch)

        for idx, state in enumerate(batch.next_state):
            if state is not None:
                non_final_mask.append(True)
                non_final_idx.append(idx)
                non_final_next_states.append(state)
            else:
                non_final_mask.append(False)
        non_final_mask = torch.tensor(non_final_mask, dtype=torch.bool)
        # non_final_idx = np.array(non_final_idx)
        non_final_next_states = torch.cat(non_final_next_states)

        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the columns of actions taken
        state_action_values = model(state_batch).gather(1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        next_state_values = (torch.zeros(batch_size).float().to(device)
                             )  # zero for terminal states

        # what would the model predict
        next_state_values[non_final_mask] = model(non_final_next_states).max(
            1)[0]
        with torch.no_grad():
            expected_state_action_values = (
                next_state_values *
                gamma) + reward_batch  # compute the expected Q values

        loss = F.smooth_l1_loss(
            state_action_values.view(-1),
            expected_state_action_values.view(-1))  # compute Huber loss

        # optimize the network towards the expected Q-values
        optimizer.zero_grad()
        loss.backward()
        for param in model.parameters():
            param.grad.data.clamp_(-1, 1)
        optimizer.step()
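
The `batch` argument above is accessed through `.state`, `.action`, `.reward`, and `.next_state` fields rather than as a plain `np.ndarray`; a common pattern (a sketch, not necessarily this code base's definition) is a namedtuple transposed from a list of sampled transitions:

from collections import namedtuple
import random

Transition = namedtuple('Transition', ('state', 'action', 'reward', 'next_state'))

def sample_batch(memory, batch_size):
    # memory is a list of Transition objects; next_state is None for terminal states.
    transitions = random.sample(memory, batch_size)
    return Transition(*zip(*transitions))  # a Transition of tuples, one tuple per field

Note that with this layout `len(batch)` would count the namedtuple's fields, so the batch size would come from `len(batch.state)` rather than `len(batch)`.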
Example #6
    def calc_loss(self, coord, label, loc, conf):
        # step2. hard negative mining
        num_class = conf.size(2)

        coord = coord.view(-1, 4)
        loc = loc.view(-1, 4)

        label = label.view(-1)
        conf = conf.view(-1, num_class)

        # "positive" means label is not background
        pos_mask = label != 0
        pos_conf = conf[pos_mask]
        pos_label = label[pos_mask]

        # sort background confidence by loss in descending order
        tmp = F.cross_entropy(conf, label, reduction='none')
        tmp[pos_mask] = 0.

        _, neg_indices = tmp.sort(descending=True)

        # pick num(positive_samples)*3 of negative samples per batch
        num_pos = pos_conf.size(0)
        num_neg = min(num_pos * 3, conf.size(0) - num_pos)

        neg_conf = conf[neg_indices[0:num_neg]]
        neg_label = label[neg_indices[0:num_neg]]

        conf = torch.cat([pos_conf, neg_conf], 0)
        label = torch.cat([pos_label, neg_label], 0)

        l_conf = F.cross_entropy(conf, label, reduction='sum')

        # - calc l_loc
        coord = coord[pos_mask]
        loc = loc[pos_mask]

        l_loc = F.smooth_l1_loss(loc, coord, reduction='sum')

        return (l_conf + self.alpha * l_loc) / num_pos
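
A call with invented SSD-like shapes (a batch of 2 images, 8732 default boxes, 21 classes); `criterion` stands in for an instance of the class that defines calc_loss above:

import torch

num_batch, num_dbox, num_classes = 2, 8732, 21
coord = torch.rand(num_batch, num_dbox, 4)                     # matched ground-truth offsets
loc = torch.rand(num_batch, num_dbox, 4)                       # predicted offsets
label = torch.randint(0, num_classes, (num_batch, num_dbox))   # 0 = background
conf = torch.rand(num_batch, num_dbox, num_classes)            # predicted class logits

loss = criterion.calc_loss(coord, label, loc, conf)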
Example #7
def calculate_loss(rewards, log_probabilities, values, entropies, config):
    # print(f'rewards: {rewards}, rewards type: {type(rewards)}')
    # print(f'log_probabilities: {log_probabilities}')
    # print(f'values: {values}')

    if len(rewards) <= 0:
        return None

    discounted_rewards = []
    accumulated_rewards = 0
    for current_reward in rewards[::-1]:
        accumulated_rewards = config.reward_discount * accumulated_rewards + current_reward
        discounted_rewards.append(accumulated_rewards)

    # print(f'discounted rewards: {discounted_rewards}')

    discounted_rewards = torch.tensor(discounted_rewards[::-1]).float().to(
        config.device)
    unbiased = len(discounted_rewards) > 1
    # print(f'unbiased: {unbiased}')
    # print(f'std+eps rewards: std: {discounted_rewards.std(unbiased=unbiased)},
    # {(discounted_rewards.std(unbiased=unbiased) + config.eps)}')
    normalized_rewards = (discounted_rewards - discounted_rewards.mean()) / \
                         (discounted_rewards.std(unbiased=unbiased) + config.eps)

    # print(f'normalized_rewards: {normalized_rewards}')

    policy_loss = []
    value_loss = []
    for reward, log_probability, value in zip(normalized_rewards,
                                              log_probabilities, values):
        policy_loss.append((reward - value) * -log_probability)
        value = value.squeeze(0).squeeze(0)
        value_loss.append(F.smooth_l1_loss(value, reward))

    # print(f'policy_loss: {policy_loss}')
    # print(f'value_loss: {value_loss}')

    return torch.stack(policy_loss).sum() + 0.5 * torch.stack(value_loss).sum()
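
A toy invocation with made-up tensors and a minimal stand-in config (the field names `reward_discount`, `eps`, and `device` are the ones read by the function above):

import torch
from types import SimpleNamespace

config = SimpleNamespace(reward_discount=0.99, eps=1e-8, device='cpu')

rewards = [1.0, 0.0, -1.0]
log_probabilities = [torch.tensor(-0.1), torch.tensor(-0.5), torch.tensor(-0.3)]
values = [torch.zeros(1, 1), torch.zeros(1, 1), torch.zeros(1, 1)]
entropies = []  # unused by the function above

loss = calculate_loss(rewards, log_probabilities, values, entropies, config)
print(loss)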
Example #8
    def update(self, agent):
        """

        :param agent: the agent ID
        """
        if len(self.replay_buffer) < self.batch_size:
            return

        nbatch = self.replay_buffer.sample(self.batch_size, tensorize=True)
        batch = nbatch[agent]

        non_final_mask = torch.tensor(
            tuple(map(lambda s: not s, batch.done)), dtype=torch.bool)

        non_final_next_states = torch.stack(
            [s for done, s in zip(batch.done, batch.next_states) if not done])

        state_batch = batch.states
        action_batch = batch.actions
        reward_batch = batch.rewards

        state_action_values = ((self.policies[agent](state_batch) *
                                action_batch).sum(dim=1).view(-1, 1))

        next_state_values = torch.zeros(self.batch_size)
        best_actions = (self.policies[agent](non_final_next_states).argmax(
            1).unsqueeze(-1))
        next_state_values[non_final_mask] = (
            self.policy_targets[agent](non_final_next_states).gather(
                dim=1, index=best_actions).squeeze().detach())
        targets = (next_state_values * self.gamma) + reward_batch

        loss = F.smooth_l1_loss(state_action_values, targets.unsqueeze(1))
        self.policy_optimizers[agent].zero_grad()
        loss.backward()
        for param in self.policies[agent].parameters():
            param.grad.data.clamp_(-self.grad_clip, self.grad_clip)
        self.policy_optimizers[agent].step()
Example #9
    def forward(self, predictions, targets):
        """
        損失関数の計算
        Args:
            predictions: SSD netの訓練時の出力(tuple)
             loc=torch.Size([num_batch, 8732, 4]),
             conf=torch.Size([num_batch, 8732, 21]),
             dbox_list=torch.Size([8732, 4])

            targets: [num_batch, num_jobs, 5]
            5は正解アノテーション情報[xmin, ymin, xmax, ymax, label_index]を示す

        Returns:
            loss_l: locの損失値 SmoothL1Loss
            loss_c: confの損失値 CrossEntropyLoss
        """

        loc_data, conf_data, dbox_list = predictions
        # print("loc_data size: ", loc_data.size())
        num_batch = loc_data.size(0)  # mini-batch size
        num_dbox = loc_data.size(1)  # number of DBoxes (8732)
        num_classes = conf_data.size(2)  # number of classes (21)

        # Variables used in the loss computation
        # conf_t_label: for each of the 8732 DBoxes, the label of the closest ground-truth BBox
        # loc_t: for each of the 8732 DBoxes, the location of the closest ground-truth BBox
        conf_t_label = torch.LongTensor(num_batch,
                                        num_dbox).to(self.device)  # torch.long
        loc_t = torch.Tensor(num_batch, num_dbox,
                             4).to(self.device)  # torch.Tensor defaults to torch.float32
        # print("loc_t size: ", loc_t.size())
        # print("conf_t_label size: ", conf_t_label.size())

        # Overwrite loc_t and conf_t_label with the result of matching the DBoxes
        # against the ground-truth annotations targets (BBoxes)
        for idx in range(num_batch):
            truths_loc = targets[idx][:, :-1].to(self.device)  # BBox
            labels_conf = targets[idx][:, -1].to(self.device)  # Labels
            # print("truths_loc size: ", truths_loc.size())
            # print("labels_conf size: ", labels_conf)

            dbox = dbox_list.to(self.device)

            # Run the match function to update the contents of loc_t and conf_t_label
            # (details)
            # loc_t: overwritten with the location of the closest ground-truth BBox for each DBox
            # conf_t_label: overwritten with the label of the closest ground-truth BBox for each DBox
            # However, if the Jaccard overlap with the closest BBox is smaller than 0.5,
            # conf_t_label is set to the background class 0
            variance = [0.1, 0.2]
            # loc_t[idx], conf_t_label[idx] = match(self.jaccard_thresh, truths_loc, dbox, variance, labels_conf)
            match(self.jaccard_thresh, truths_loc, dbox, variance, labels_conf,
                  loc_t, conf_t_label, idx)

        # At this point:
        # loc_t holds valid values only for the entries that correspond to Positive DBoxes (out of 8732)
        # conf_t_label still has 8732 entries: Positive DBoxes hold the class label of the target BBox,
        # and Negative DBoxes hold the background class (0)

        # -----
        # Localization loss: loss_l
        # Smooth L1 function
        # Computed only for the offsets of DBoxes that detected an object
        # -----

        # Mask that extracts the DBoxes that detected an object (Positive DBoxes)
        pos_mask = conf_t_label > 0  # torch.Size([num_batch, 8732])

        # torch.Size([num_batch, 8732]) -> torch.Size([num_batch, 8732, 4])
        pos_idx = pos_mask.unsqueeze(pos_mask.dim()).expand_as(loc_data)

        # Get loc_data (predicted location offsets) and the teacher data loc_t for the Positive DBoxes
        loc_p = loc_data[pos_idx].view(
            -1, 4)  # boolean indexing always returns a 1-D array, so reshape it back
        loc_t = loc_t[pos_idx].view(-1, 4)

        # Compute the loss (error) of the offsets loc_t for the Positive DBoxes that found an object
        loss_l = F.smooth_l1_loss(loc_p, loc_t, reduction='sum')
        # print("loc_p", loc_p)
        # print("loc_t", loc_t)
        # print("loss_l", loss_l)

        # -----
        # Class prediction loss: loss_c
        # Cross-entropy loss function
        # DBoxes whose ground truth is the background class are overwhelmingly more common,
        # so Hard Negative Mining is applied so that the ratio of object DBoxes to
        # background DBoxes becomes 1:3.
        # Among the DBoxes predicted as background, those with a small loss are excluded
        # from the class prediction loss.
        # -----

        batch_conf = conf_data.view(
            -1, num_classes)  # (batch_num, 8732, 21) -> (batch_num*8732, 21)
        # print("batch_conf", batch_conf)
        # print("batch_conf size: ", batch_conf.size())

        # Compute the class prediction loss (reduction='none' keeps the per-DBox losses
        # instead of summing them away)
        # batch_conf size: (batch_num*8732, 21), conf_t_label size: (batch_num*8732,)
        loss_c = F.cross_entropy(batch_conf,
                                 conf_t_label.view(-1),
                                 reduction='none')  # for now, compute the loss for every DBox
        # loss_c: (batch_num * 8732,)

        # -----
        # Create the mask that selects which Negative DBoxes to keep with Hard Negative Mining
        # -----

        # Set the loss of Positive DBoxes that found an object to 0
        # (note) objects have a label of 1 or higher; 0 is the background
        num_pos = pos_mask.long().sum(
            dim=1, keepdim=True
        )  # number of Positive Boxes per input image: (batch_num, 8732) -> (batch_num, 1)
        loss_c = loss_c.view(num_batch, -1)  # torch.Size([num_batch, 8732])
        loss_c[pos_mask] = 0  # zero out the loss of DBoxes that found an object

        # Run Hard Negative Mining
        """Compute idx_rank, the rank of each DBox by the magnitude of its loss loss_c"""
        _, loss_idx = loss_c.sort(dim=1,
                                  descending=True)  # sort the 8732 DBoxes by loss in descending order
        _, idx_rank = loss_idx.sort(dim=1)
        # loss_idx holds the original-array indices of the DBox losses sorted in descending order
        """
        (注釈)
        上2行の実装コードは特殊で直感的でない。
        やりたいことは、各DBoxに対して、損失の大きさが何番目なのかの情報をidx_rankとして高速に取得する。

        DBoxの損失値の大きい方から降順に並べ、DBoxの降順のindexをloss_idxに格納。
        損失の大きさloss_cの順位であるidx_rankを求める。
        ここで、
        降順になった配列indexであるloss_idxを0~8732までの昇順で並べ直すためには、
        何番目のloss_idxのインデックスを取ってきたら良いかを示すのが、idx_rankである。
        例えば、
        idx_rankの要素0番目 = idx_rank[0]を求めるには、loss_idxの値が0の要素、つまり
        loss_idx[?] = 0の?は何番目かを求めることになる。ここで、? = idx_rank[0]である。
        いま、loss_idx[?] = 0の0は、元のloss_cの要素の0番目という意味である。
        つまり、?は、元のloss_cの要素0番目は、降順に並び替えられたloss_idxの何番目ですか
        を求めていることになり、結果、? = idx_rank[0]はloss_cの要素0番目が降順の何番目かを示す。

        e.g
        loss_c                      3.2  5.8  1.3  2.5  4.0
        sorted_loss_c               5.8  4.0  3.2  2.5  1.3
        descending_of_loss_c_index    1    4    0    3    2 (loss_idx)
        sorted_loss_idx               0    1    2    3    4
        ascending_of_loss_idx         2    0    4    3    1 (idx_rank)

        """

        # Decide num_neg, the number of background DBoxes to keep. With Hard Negative Mining this is
        # self.negpos_ratio (3) times num_pos, the number of DBoxes that found an object.
        # In case this would exceed the total number of DBoxes, cap it at num_dbox.
        num_neg = torch.clamp(num_pos * self.negpos_ratio, max=num_dbox)

        # Mask that extracts the DBoxes whose loss rank is within num_neg (i.e. whose loss is large)
        neg_mask = idx_rank < num_neg.expand_as(idx_rank)

        # -----
        # (end of Hard Negative Mining)
        # -----

        # Masks that select, among the Negative DBoxes, the ones extracted by Hard Negative Mining

        # pos_mask: torch.Size([num_batch, 8732]) -> pos_idx_mask: torch.Size([num_batch, 8732, 21])
        pos_idx_mask = pos_mask.unsqueeze(2).expand_as(conf_data)
        neg_idx_mask = neg_mask.unsqueeze(2).expand_as(conf_data)

        # Extract only the pos and neg entries into conf_hnm: torch.Size([num_pos + num_neg, 21])
        # gt is short for "greater than" (>); it picks the indices where the mask is 1.
        conf_hnm = conf_data[(pos_idx_mask + neg_idx_mask).gt(0)].view(
            -1, num_classes)

        # conf_t_label restricted to pos and neg entries: torch.Size([pos + neg])
        conf_t_label_hnm = conf_t_label[(pos_mask + neg_mask).gt(0)]

        # Compute the confidence loss
        loss_c = F.cross_entropy(conf_hnm, conf_t_label_hnm, reduction='sum')
        # print("conf_hnm", conf_hnm)
        # print("conf_t_label_num", conf_t_label_hnm)
        # print("loss_c", loss_c)

        # Divide the losses by N, the number of BBoxes that found an object (summed over the mini-batch)
        N = num_pos.sum()
        loss_l /= N
        loss_c /= N

        return loss_l, loss_c
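
The double-sort trick discussed in the long note above can be checked in isolation; this small sketch reproduces the e.g. table with a toy loss vector:

import torch

loss_c = torch.tensor([3.2, 5.8, 1.3, 2.5, 4.0])
_, loss_idx = loss_c.sort(descending=True)   # indices of the losses from largest to smallest
_, idx_rank = loss_idx.sort()                # rank (0 = largest loss) of each original element

print(loss_idx)   # tensor([1, 4, 0, 3, 2])
print(idx_rank)   # tensor([2, 0, 4, 3, 1])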
Example #10
    def train_agent(self, memory_buffer):

        states, actions, log_probs_old, returns, advantages = memory_buffer.cat(
            ['s', 'a', 'log_pi_a', 'ret', 'adv'])
        actions = actions.detach()
        log_probs_old = log_probs_old.detach()
        advantages = (advantages - advantages.mean()) / advantages.std()

        sum_returns = 0
        sum_advantage = 0
        sum_policy_loss = 0
        sum_critic_loss = 0
        sum_entropy = 0
        batch_steps = 0

        # lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(self.opt, T_max=config.optimization_epochs, eta_min=self.min_lr)

        self.network.train()

        config = self.config

        for ep in range(config.optimization_epochs):
            sampler = random_sample(np.arange(states.size(0)),
                                    config.mini_batch_size)

            for batch_indices in sampler:
                batch_indices = tensor(batch_indices).long()
                sampled_states = states[batch_indices]
                sampled_actions = actions[batch_indices]
                sampled_log_probs_old = log_probs_old[batch_indices]
                sampled_returns = returns[batch_indices]
                sampled_advantages = advantages[batch_indices]

                # this activates only the part of the network responsible for V and log_policy;
                # actions in this case are already provided and will not be recalculated
                prediction = self.network(sampled_states.cuda(),
                                          sampled_actions.cuda())

                # ratio between the newly calculated policy and the old policy (exp of the log-prob difference)
                ratio = (prediction['log_pi_a'] - sampled_log_probs_old).exp()

                obj = ratio * sampled_advantages

                # PPO ratio clipping to [1 - epsilon, 1 + epsilon] happens here
                obj_clipped = ratio.clamp(
                    1.0 - self.config.ppo_ratio_clip,
                    1.0 + self.config.ppo_ratio_clip) * sampled_advantages

                # entropy_weight is a factor for entropy boost - it should be set to 0 once the training stabilises
                policy_loss = torch.min(obj, obj_clipped).mean(
                ) + config.entropy_weight * prediction['ent'].mean()

                # Huber loss
                value_loss = F.smooth_l1_loss(prediction['v'],
                                              sampled_returns.view(-1, 1))

                sum_returns, sum_advantage, sum_policy_loss, sum_critic_loss, sum_entropy = \
                    self.log_stats(sampled_returns, sampled_advantages, policy_loss, value_loss,
                                   prediction['ent'].mean(),
                                   batch_steps, sum_returns, sum_advantage, sum_critic_loss, sum_policy_loss,
                                   sum_entropy)
                batch_steps += 1

                self.opt.zero_grad()
                (-(policy_loss - value_loss)).backward()
                nn.utils.clip_grad_norm_(self.network.parameters(),
                                         config.gradient_clip)
                self.opt.step()

            # lr_scheduler.step()

        return batch_steps
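
`random_sample` above is an external helper that is not shown; a plausible sketch, assuming it shuffles the index array and yields mini-batches of the requested size (plus any remainder), is:

import numpy as np

def random_sample(indices, batch_size):
    # Shuffle once, then yield consecutive chunks as mini-batches.
    indices = np.random.permutation(indices)
    batches = indices[:len(indices) // batch_size * batch_size].reshape(-1, batch_size)
    for batch in batches:
        yield batch
    remainder = len(indices) % batch_size
    if remainder:
        yield indices[-remainder:]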
Example #11
    def smooth_l1_loss(coord, loc):
        loss = F.smooth_l1_loss(loc, coord, reduction='sum')

        return loss
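
For reference, `F.smooth_l1_loss` with the default beta of 1.0 computes the Huber-style loss 0.5*x**2 for |x| < 1 and |x| - 0.5 otherwise; a quick check of the summed form used above against a manual computation:

import torch
import torch.nn.functional as F

loc = torch.tensor([0.2, 1.5, -3.0])
coord = torch.tensor([0.0, 0.0, 0.0])

diff = torch.abs(loc - coord)
manual = torch.where(diff < 1.0, 0.5 * diff ** 2, diff - 0.5).sum()
library = F.smooth_l1_loss(loc, coord, reduction='sum')
print(manual.item(), library.item())  # both 0.02 + 1.0 + 2.5 = 3.52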