def compute_grad(self, episodes):
        ng_grads = []
        for train_episodes, valid_episodes in episodes:
            params_adapt = self.adapt_first_order(train_episodes)

            # self.baseline.fit(valid_episodes)
            loss = self.inner_loss(valid_episodes, params=params_adapt)
            ng_grad_0 = torch.autograd.grad(
                loss, self.policy.parameters())  # no create graph
            ng_grad_0 = parameters_to_vector(ng_grad_0)

            self.baseline.fit(train_episodes)
            loss = self.inner_loss(train_episodes)
            grad = torch.autograd.grad(loss,
                                       self.policy.parameters(),
                                       create_graph=True)
            grad = parameters_to_vector(grad)
            grad_ng_grad_0 = torch.dot(grad, ng_grad_0)
            ng_grad_1 = torch.autograd.grad(grad_ng_grad_0,
                                            self.policy.parameters())
            ng_grad_1 = parameters_to_vector(ng_grad_1)

            ng_grad = ng_grad_0 - 0.1 * ng_grad_1

            ng_grad = parameters_to_vector(ng_grad)
            ng_grads.append(ng_grad.view(len(ng_grad), 1))

        return torch.mean(torch.stack(ng_grads, dim=1), dim=[1, 2])
 def learn(self):
     self.sample_batch()
     # imp_fac: should be a 1-D Variable or Tensor whose size equals a.size(0)
     imp_fac = self.compute_imp_fac()
     self.estimate_value()
     self.A = (self.A - self.A.mean()) / (self.A.std() + 1e-8)
     self.loss = -(imp_fac * self.A
                   ).mean() - self.entropy_weight * self.compute_entropy()
     if self.value_type is not None:
         # update value
         for i in range(self.iters_v):
             self.update_value()
     self.policy.zero_grad()
     loss_grad = torch.autograd.grad(self.loss,
                                     self.policy.parameters(),
                                     create_graph=True)
     # loss_grad_vector is a 1-D tensor containing the gradients of all parameters in self.policy
     loss_grad_vector = parameters_to_vector([grad for grad in loss_grad])
     # solve Ax = -g, where A is the Hessian matrix of the KL divergence
     trpo_grad_direc = self.conjunction_gradient(-loss_grad_vector)
     shs = .5 * torch.sum(
         trpo_grad_direc * self.hessian_vector_product(trpo_grad_direc))
     beta = torch.sqrt(self.max_kl / shs)
     fullstep = trpo_grad_direc * beta
     gdotstepdir = -torch.sum(loss_grad_vector * trpo_grad_direc)
     theta = self.linear_search(
         parameters_to_vector(self.policy.parameters()), fullstep,
         gdotstepdir * beta)
     # update policy
     vector_to_parameters(theta, self.policy.parameters())
     self.learn_step_counter += 1
     self.cur_kl = self.mean_kl_divergence().item()
     self.policy_ent = self.compute_entropy().item()
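The conjugate-gradient helper `self.conjunction_gradient` used above is not shown on this page. A standard conjugate-gradient solver that only needs a Hessian-vector product callable would look roughly like the sketch below (the names `hvp` and `b` are illustrative, not taken from the code above):

import torch

def conjugate_gradient(hvp, b, iters=10, tol=1e-10):
    """Approximately solve H x = b, given only a Hessian-vector product callable `hvp`."""
    x = torch.zeros_like(b)
    r = b.clone()          # residual b - H x (x starts at zero)
    p = r.clone()          # current search direction
    r_dot = torch.dot(r, r)
    for _ in range(iters):
        Hp = hvp(p)
        alpha = r_dot / torch.dot(p, Hp)
        x += alpha * p
        r -= alpha * Hp
        new_r_dot = torch.dot(r, r)
        if new_r_dot < tol:
            break
        p = r + (new_r_dot / r_dot) * p
        r_dot = new_r_dot
    return x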
Example #3
    def adapt(self, episodes, first_order=False):
        """Adapt the parameters of the policy network to a new task, from 
        sampled trajectories `episodes`, with a one-step gradient update [1].
        """
        # Fit the baseline to the training episodes
        self.baseline.fit(episodes)
        
        params = None
        
        info = AttrDict()
        loss = self.inner_loss(episodes, params)
        
        info.pre_update_loss = loss.detach().cpu().numpy()
        
        for _ in range(self.inner_steps):
            # Get the new parameters after a one-step gradient update
            params = self.policy.update_params(
                loss, step_size=self.fast_lr, first_order=first_order,
                params=params
            )
            
            # Get the loss on the training episodes
            loss = self.inner_loss(episodes, params)
            
        info.post_update_loss = loss.detach().cpu().numpy()
        
        info.weight_change = torch.norm(
            parameters_to_vector(self.policy.parameters())
            - parameters_to_vector(params.values())
        ).detach().cpu().numpy()

        return params, info
Example #4
    def _fisher_vector_product(self, vector_p_with_state_batch):
        """
        To solve for x in b = Hx we would need the inverse of H,
        but computing the inverse of H directly is hard.
        So instead, approximate Hx with a Fisher-vector product and return that.

        D_KL
        ∇D_KL
        (∇D_KL)^T * x
        ∇((∇D_KL)^T * x)

        Hx = ∇((∇D_KL(new θ | old θ))^T * x)
        """
        (p, s_batch) = vector_p_with_state_batch
        p = p.detach()  # detach() is not in-place; rebind so p is treated as a constant
        # the same actor is passed twice because the divergence is taken w.r.t. the current policy
        kl = kl_divergence(new_actor=self.actor,
                           old_actor=self.actor,
                           s_batch=s_batch)
        kl = kl.mean()
        kl_grad = autograd.grad(kl, self.actor.parameters(), create_graph=True)
        kl_grad = parameters_to_vector(kl_grad)  # check kl_grad == 0

        kl_grad_p = (kl_grad * p).sum()
        kl_hessian_p = autograd.grad(kl_grad_p, self.actor.parameters())
        kl_hessian_p = parameters_to_vector(kl_hessian_p)

        return kl_hessian_p + self.damping_coeff * p
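The double-backprop trick described in the docstring (differentiate (∇D_KL)^T x once more to obtain Hx) is generic. A minimal, self-contained sketch of the same pattern on a toy scalar function (not the actor above) is:

import torch

# toy parameters and function; stands in for the KL of the actor above
theta = torch.randn(3, requires_grad=True)
f = (theta ** 2).sum()                      # any twice-differentiable scalar

x = torch.randn(3)                          # the vector to multiply by the Hessian
(grad,) = torch.autograd.grad(f, theta, create_graph=True)
gx = torch.dot(grad, x)                     # (∇f)^T x
(hx,) = torch.autograd.grad(gx, theta)      # ∇((∇f)^T x) = H x
# for f = Σ θ², H = 2I, so hx equals 2 * x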
Example #5
    def optim_value_lbfgs(self, V_target, inds):
        value = self.value
        value.zero_grad()
        loss_fn = self.loss_func_v

        def V_closure():
            predicted = value(self.s[inds],
                              other_data=self.other_data[inds] if
                              self.other_data is not None else None).squeeze()
            loss = loss_fn(predicted, V_target)
            self.value_loss += loss.item()
            optimizer.zero_grad()
            loss.backward()
            return loss

        old_params = parameters_to_vector(value.parameters())
        for lr in self.lr * .5**np.arange(10):
            optimizer = optim.LBFGS(self.value.parameters(), lr=lr)
            optimizer.step(V_closure)
            current_params = parameters_to_vector(value.parameters())
            if any(np.isnan(current_params.data.cpu().numpy())):
                print("LBFGS optimization diverged. Rolling back update...")
                vector_to_parameters(old_params, value.parameters())
            else:
                return
    def step(self,
             episodes,
             max_kl=1e-3,
             cg_iters=10,
             cg_damping=1e-2,
             ls_max_steps=10,
             ls_backtrack_ratio=0.5):
        """Meta-optimization step (ie. update of the initial parameters), based 
        on Trust Region Policy Optimization (TRPO, [4]).
        """
        old_loss, _, old_pis = self.surrogate_loss(episodes)
        grads = torch.autograd.grad(old_loss, self.policy.parameters())
        grads = parameters_to_vector(grads)

        step = grads / torch.norm(grads)

        # Save the old parameters
        old_params = parameters_to_vector(self.policy.parameters())

        # Line search
        step_size = 1.0
        for _ in range(ls_max_steps):
            vector_to_parameters(old_params - step_size * step,
                                 self.policy.parameters())
            loss, kl, _ = self.surrogate_loss(episodes, old_pis=old_pis)
            improve = loss - old_loss
            if (improve.item() < 0.0) and (kl.item() < max_kl):
                break
            step_size *= ls_backtrack_ratio
        else:
            vector_to_parameters(old_params, self.policy.parameters())
Example #7
    def compute_ng_gradient_test(self,
                                 episodes,
                                 max_kl=1e-3,
                                 cg_iters=20,
                                 cg_damping=1e-2,
                                 ls_max_steps=10,
                                 ls_backtrack_ratio=0.5):
        ng_grads = []
        for train_episodes, valid_episodes in episodes:
            params_adapt, step_size, _ = self.adapt_ng_test(train_episodes)

            # self.baseline.fit(valid_episodes)
            loss = self.inner_loss_lvc(valid_episodes, params=params_adapt)
            ng_grad_0 = torch.autograd.grad(
                loss, self.policy.parameters())  # no create graph
            ng_grad_0 = parameters_to_vector(ng_grad_0)

            self.baseline.fit(train_episodes)
            loss = self.inner_loss_lvc(train_episodes)
            grad = torch.autograd.grad(loss,
                                       self.policy.parameters(),
                                       create_graph=True)
            grad = parameters_to_vector(grad)
            grad_F_inv_grad = torch.dot(grad, ng_grad_0)
            ng_grad_1 = torch.autograd.grad(grad_F_inv_grad,
                                            self.policy.parameters())
            ng_grad_1 = parameters_to_vector(ng_grad_1)

            ng_grad = ng_grad_0 - step_size * ng_grad_1

            ng_grad = parameters_to_vector(ng_grad)
            ng_grads.append(ng_grad.view(len(ng_grad), 1))

        return torch.mean(torch.stack(ng_grads, dim=1), dim=[1, 2])
Example #8
    def adapt(self,
              episodes,
              first_order=False,
              max_kl=1e-3,
              cg_iters=20,
              cg_damping=1e-2,
              ls_max_steps=10,
              ls_backtrack_ratio=0.5):
        """Adapt the parameters of the policy network to a new task, from 
        sampled trajectories `episodes`, with a one-step natural gradient update.
        """
        # Fit the baseline to the training episodes
        self.baseline.fit(episodes)
        # Get the loss on the training episodes
        loss = self.inner_loss(episodes)
        # Get the new parameters after a one-step natural gradient update
        grads = torch.autograd.grad(loss, self.policy.parameters())
        grads = parameters_to_vector(grads)

        # Compute the step direction with Conjugate Gradient
        hessian_vector_product = self.hessian_vector_product_ng(
            episodes, damping=cg_damping)
        stepdir = conjugate_gradient(hessian_vector_product,
                                     grads,
                                     cg_iters=cg_iters)

        step = stepdir.detach()
        old_params = parameters_to_vector(self.policy.parameters())
        step_size = 1.0e-2
        params = vector_to_named_parameter_like(old_params - step_size * step,
                                                self.policy.named_parameters())
        # TODO check if params is a function of self.policy.parameters()
        return params, step_size, step
    def gradient_ascent_step(self):
        """Makes one update of policy weights"""
        
        # get loss
        loss = self.surrogate_function(write_to_log=True)
        
        # calculating gradient
        self.policy.optimizer.zero_grad()
        loss.backward(retain_graph=True)
        policy_gradient = parameters_to_vector([v.grad for v in self.policy.parameters()]).squeeze(0)        
        assert policy_gradient.nonzero().size()[0] > 0, "Policy gradient is 0. Skipping update?.."
        
        # Use conjugate gradient algorithm to determine the step direction in theta space
        step_direction = self.conjugate_gradient(-policy_gradient.cpu().numpy())

        # Do line search to determine the stepsize of theta in the direction of step_direction
        shs = step_direction.dot(self.hessian_vector_product(Tensor(step_direction)).cpu().numpy().T) / 2
        lm = np.sqrt(shs / self.config.max_kl)
        fullstep = step_direction / lm
        gdotstepdir = -policy_gradient.dot(Tensor(step_direction)).data[0]
        theta = self.linesearch(parameters_to_vector(self.policy.parameters()), fullstep, gdotstepdir / lm)

        # Update parameters of policy model
        if any(np.isnan(theta.data.cpu().numpy())):
          raise Exception("NaN detected. Skipping update...")
        else:
          vector_to_parameters(theta, self.policy.parameters())

        kl_old_new = self.mean_kl_divergence()
        self.logger["kl_change"].append(kl_old_new.item())
Example #10
    def learn_htrpo(self):
        b_t = time.time()
        self.sample_batch()
        self.split_episode()
        # No valid episode is collected
        if self.n_valid_ep == 0:
            return
        self.generate_subgoals()
        if not self.using_original_data:
            self.reset_training_data()
        if self.sampled_goal_num is None or self.sampled_goal_num > 0:
            self.generate_fake_data()
        self.data_preprocess()
        self.other_data = self.goal

        # Optimize Value Estimator
        self.estimate_value()
        if self.value_type is not None:
            # update value
            for i in range(self.iters_v):
                self.update_value()

        # Optimize Policy
        # imp_fac: should be a 1-D Variable or Tensor whose size equals a.size(0)
        # Likelihood Ratio
        # self.estimate_value()
        imp_fac = self.compute_imp_fac()

        if self.value_type:
            # old value estimator
            self.A = self.gamma_discount * self.hratio * self.A
        else:
            self.A = self.gamma_discount * self.A

        # Here mean() and sum() / self.n_traj differ only by a constant factor.
        # That factor is compensated by the step-size computation in TRPO.
        # In vanilla PG, however, there is no such compensation, so the loss
        # must match the exact form of the equation in the paper.
        self.loss = - (imp_fac * self.A).mean() - self.entropy_weight * self.compute_entropy()

        self.policy.zero_grad()
        loss_grad = torch.autograd.grad(
            self.loss, self.policy.parameters(), create_graph=True)
        # loss_grad_vector is a 1-D Variable including all parameters in self.policy
        loss_grad_vector = parameters_to_vector([grad for grad in loss_grad])
        # solve Ax = -g, A is Hessian Matrix of KL divergence
        trpo_grad_direc = self.conjunction_gradient(- loss_grad_vector)
        shs = .5 * torch.sum(trpo_grad_direc * self.hessian_vector_product(trpo_grad_direc))
        beta = torch.sqrt(self.max_kl / shs)
        fullstep = trpo_grad_direc * beta
        gdotstepdir = -torch.sum(loss_grad_vector * trpo_grad_direc)
        theta = self.linear_search(parameters_to_vector(
            self.policy.parameters()), fullstep, gdotstepdir * beta)
        vector_to_parameters(theta, self.policy.parameters())
        self.learn_step_counter += 1
        self.cur_kl = self.mean_kl_divergence().item()
        self.policy_ent = self.compute_entropy().item()
        self.update_normalizer()
        print("iteration time:   {:.4f}".format(time.time()-b_t))
Example #11
    def step(self, H, step_size=1, closure=None):

        # the optional closure re-evaluates the model and returns the loss (standard optimizer API)
        loss = None
        if closure is not None:
            loss = closure()

        # set parameters
        params = [p for p in self.param_groups[0]['params']]
        grads = [p.grad for p in params]

        # convert parameters to a vector
        param_vector = parameters_to_vector(params)
        grad_vector = parameters_to_vector(grads)

        # apply rotation / contract / expansion
        soln, _ = torch.solve(
            grad_vector.unsqueeze(1).unsqueeze(0), H.unsqueeze(0))
        scaled_gradient = soln[0].reshape(-1)

        # add the characteristic scaling
        scaling = torch.dot(scaled_gradient, soln.reshape(-1))
        scaled_gradient *= step_size * torch.sqrt(self.divergence_limit /
                                                  (scaling + self.epsilon))

        # check that the scaling is ok before updating parameters
        if scaling > 0.:
            # update the gradient weights
            vector_to_parameters(scaled_gradient, grads)

        # now we can perform the update
        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data
                if weight_decay != 0:
                    d_p.add_(weight_decay, p.data)
                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state['momentum_buffer'] = torch.clone(
                            d_p).detach()
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(1 - dampening, d_p)
                    if nesterov:
                        d_p = d_p.add(momentum, buf)
                    else:
                        d_p = buf

                p.data.add_(-group['lr'], d_p)

        return loss
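Note that `torch.solve` above follows the older (B, A) argument order and returns a (solution, LU) pair. In more recent PyTorch releases the same linear solve can be written with `torch.linalg.solve`; a small stand-alone sketch (not a drop-in patch for the optimizer above, the tensors here are illustrative):

import torch

H = torch.eye(4) * 2.0            # stand-in for the curvature matrix H
grad_vector = torch.ones(4)

# old API:   torch.solve(B, A) solves A X = B and returns (solution, LU)
# newer API: torch.linalg.solve(A, B) solves A X = B directly
scaled_gradient = torch.linalg.solve(H, grad_vector)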
Example #12
    def compute_ng_gradient(self,
                            episodes,
                            max_kl=1e-3,
                            cg_iters=20,
                            cg_damping=1e-2,
                            ls_max_steps=10,
                            ls_backtrack_ratio=0.5):
        ng_grads = []
        for train_episodes, valid_episodes in episodes:
            params_adapt, step_size, stepdir = self.adapt_ng(
                train_episodes, cg_iters=cg_iters, cg_damping=cg_damping)

            # compute grad = \nabla_x J^{lvc}(x) at x = \theta - \eta U(\theta)
            self.baseline.fit(valid_episodes)
            loss = self.inner_loss_lvc(valid_episodes, params=params_adapt)
            ng_grad_0 = torch.autograd.grad(
                loss, self.policy.parameters())  # no create graph
            ng_grad_0 = parameters_to_vector(ng_grad_0)

            # compute the inverse of Fisher matrix at x=\theta times $grad with Conjugate Gradient
            hessian_vector_product = self.hessian_vector_product_ng(
                train_episodes, damping=cg_damping)
            F_inv_grad = conjugate_gradient(hessian_vector_product,
                                            ng_grad_0,
                                            cg_iters=cg_iters * 2)

            if self.verbose:
                print(
                    torch.norm(hessian_vector_product(F_inv_grad) - ng_grad_0)
                    / torch.norm(ng_grad_0))

            # compute $ng_grad_1 = \nabla^2 J^{lvc}(x) at x = \theta times $F_inv_grad
            # create graph for higher differential
            self.baseline.fit(train_episodes)
            loss = self.inner_loss_lvc(train_episodes)

            grad = torch.autograd.grad(loss,
                                       self.policy.parameters(),
                                       create_graph=True)
            grad = parameters_to_vector(grad)
            grad_F_inv_grad = torch.dot(grad, F_inv_grad.detach())
            ng_grad_1 = torch.autograd.grad(grad_F_inv_grad,
                                            self.policy.parameters())
            ng_grad_1 = parameters_to_vector(ng_grad_1)

            # compute $ng_grad_2 = the Jacobian of {F(x) U(\theta)} at x = \theta times $F_inv_grad
            hessian_vector_product = self.hessian_vector_product_ng(
                train_episodes, damping=cg_damping)
            F_U = hessian_vector_product(stepdir)
            ng_grad_2 = torch.autograd.grad(
                torch.dot(F_U, F_inv_grad.detach()), self.policy.parameters())
            ng_grad_2 = parameters_to_vector(ng_grad_2)
            ng_grad = ng_grad_0 - step_size * (ng_grad_1 - ng_grad_2)

            ng_grad = parameters_to_vector(ng_grad)
            ng_grads.append(ng_grad.view(len(ng_grad), 1))

        return torch.mean(torch.stack(ng_grads, dim=1), dim=[1, 2])
Example #13
    def update(self, trajectory: Iterable):
        """
        Updates the current policy given the trajectory of the policy.
        :param trajectory: a list of transition frames from the episode.
        This represents the trajectory of the episode.
        :type trajectory: Iterable
        :return: the loss from this update
        :rtype: float
        """
        if (not isinstance(trajectory, Iterable)):
            raise ValueError("trajectory must be an Iterable.")
        # Consolidate the state in the trajectory into an array.
        states = np.array(
            [np.asarray(transition.state) for transition in trajectory])
        '''
        Compute the loss as the log-likelihood of the returns.
        '''
        # Calculate the returns.
        returns = self._calculate_returns(trajectory)
        # Calculate the values using the baseline approximator.
        values = torch.Tensor([self._value_fn(state)[0] for state in states])
        # Calculate the advantage using the returns and the values.
        advantages = returns - values
        # Compute the loss of the trajectory.
        logits = torch.stack([
            self._policy.logit(np.asarray(transition.state),
                               transition.action,
                               detach=False) for transition in trajectory
        ]).view(-1)
        loss = (-logits * advantages).mean()
        '''
        Compute the gradient and the natural policy gradient.
        '''
        # Calculate the gradient of the log likelihood loss.
        gradient = self._compute_gradient(loss)
        gradient = parameters_to_vector(gradient).detach().numpy() + 1e-5

        # Calculate the natural policy gradient.
        npg = self._compute_npg(gradient, states)
        '''
        Update the policy and the baseline.
        '''
        # The learning rate to apply for the update.
        alpha = np.sqrt(
            np.abs(self.delta / (np.dot(gradient.T,
                                        npg.detach().numpy()) + 1e-20)))
        # The amount to change the parameters by.
        update = alpha * npg
        # Calculate and set the new parameters of the policy.
        new_params = parameters_to_vector(
            self._policy.get_params(False)) - update
        self._policy.set_params(new_params.detach().numpy())

        # Update baseline approximator using the cumulative returns.
        self._value_fn.update(states, returns.detach().numpy().reshape(-1, 1))

        # Return the loss from the update.
        return loss.item()
Example #14
 def update_params(self, loss, step_size=0.5, first_order=False):
     grads = torch.autograd.grad(
         loss,
         filter(lambda p: p.requires_grad, self.parameters()),
         create_graph=not first_order,
     )
     return (parameters_to_vector(
         filter(lambda p: p.requires_grad, self.parameters())) -
             parameters_to_vector(grads) * step_size)
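`update_params` above returns the adapted weights as a single flat vector. Pairing `parameters_to_vector` with `vector_to_parameters` is the usual way to move between the flat and per-module representations; a minimal round-trip sketch (toy model and step, for illustration only):

import torch
import torch.nn as nn
from torch.nn.utils import parameters_to_vector, vector_to_parameters

model = nn.Linear(4, 2)
flat = parameters_to_vector(model.parameters())   # 1-D tensor of all weights
flat = flat - 0.5 * torch.randn_like(flat)        # e.g. some flat update step
vector_to_parameters(flat, model.parameters())    # write the vector back into the module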
Example #15
    def step(self, closure=None):
        """Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']

            # HESSIAN VEC COMPUTATION
            # vectorize all parameters
            grad_vec = parameters_to_vector(group['params'])
            # create noise vector
            noise = torch.normal(means=torch.zeros_like(grad_vec), std=self.noise_factor)
            # compute the product
            grad_product = torch.sum(grad_vec * noise)
            grad_grad = torch.autograd.grad(
                grad_product, group['params'], retain_graph=True
            )
            # h_v_p = hessian_vec_product
            fisher_vec_prod = torch.cat([g.contiguous().view(-1) for g in grad_grad])
            hessian_vec_prod = fisher_vec_prod + (self.cg_damping * noise)

            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad
                d_p = p.grad.clone().data

                # REST OF SGD STUFF
                if weight_decay != 0:
                    d_p.add_(weight_decay, p.data)
                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
                        buf.mul_(momentum).add_(d_p)
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(1 - dampening, d_p)
                    if nesterov:
                        d_p = d_p.add(momentum, buf)
                    else:
                        d_p = buf
                p.data.add_(-group['lr'], d_p)
            flattened = parameters_to_vector(group['params'])
            flattened.data.add_(group['lr'], hessian_vec_prod.data)
            vector_to_parameters(flattened, group['params'])

        return loss
Example #16
    def optimize(self):
        self.total_steps += self.steps_per_train

        if self.total_steps >= self.learning_start:
            experience_sample = ray.get(
                self.experience_replay.sample.remote(self.batch_size))
            state = torch.cat([
                torch.from_numpy(s.state).cuda().unsqueeze(0)
                for s in experience_sample
            ])
            next_state = torch.cat([
                torch.from_numpy(s.next_state).cuda().unsqueeze(0)
                for s in experience_sample
            ])
            terminal = (torch.tensor([s.terminal for s in experience_sample
                                      ]).cuda().unsqueeze(1))
            reward = (torch.tensor([s.reward for s in experience_sample
                                    ]).cuda().unsqueeze(1))
            action = torch.tensor([s.action for s in experience_sample]).cuda()

            # Train value function
            target = (
                reward + self.gamma * (1 - terminal) * self.target_value_fn(
                    next_state, self.target_policy(next_state))).detach()
            actual = self.online_value_fn(state, action)
            value_fn_loss = self.value_fn_criterion(target, actual)
            value_fn_loss.backward()
            self.value_fn_opt.step()
            self.online_policy.zero_grad()
            self.online_value_fn.zero_grad()

            # Train policy
            policy_loss = -self.online_value_fn(
                state, self.online_policy(state)).mean()
            policy_loss.backward()
            self.policy_opt.step()
            self.online_policy.zero_grad()
            self.online_value_fn.zero_grad()

            # Update target networks
            v_policy = parameters_to_vector(self.online_policy.parameters())
            v_policy_targ = parameters_to_vector(
                self.target_policy.parameters())
            new_v_policy_targ = (self.polyak * v_policy_targ +
                                 (1 - self.polyak) * v_policy)
            vector_to_parameters(new_v_policy_targ,
                                 self.target_policy.parameters())

            v_value_fn = parameters_to_vector(
                self.online_value_fn.parameters())
            v_value_fn_targ = parameters_to_vector(
                self.target_value_fn.parameters())
            new_v_value_fn_targ = (self.polyak * v_value_fn_targ +
                                   (1 - self.polyak) * v_value_fn)
            vector_to_parameters(new_v_value_fn_targ,
                                 self.target_value_fn.parameters())
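The target-network update above performs Polyak averaging on the flattened parameter vectors. An equivalent in-place, per-parameter form (a common alternative, shown here only as a sketch with illustrative modules) is:

import torch
import torch.nn as nn

online = nn.Linear(4, 2)
target = nn.Linear(4, 2)
polyak = 0.995

with torch.no_grad():
    for p, p_targ in zip(online.parameters(), target.parameters()):
        # p_targ <- polyak * p_targ + (1 - polyak) * p, done in place
        p_targ.mul_(polyak).add_((1.0 - polyak) * p)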
Example #17
        def _product(vector):
            kl = self.kl_divergence(episodes, old_pis=None)
            grads = torch.autograd.grad(kl, self.policy.parameters(), create_graph=True)
            flat_grad_kl = parameters_to_vector(grads)

            grad_kl_v = torch.dot(flat_grad_kl, vector)
            grad2s = torch.autograd.grad(grad_kl_v, self.policy.parameters())
            flat_grad2_kl = parameters_to_vector(grad2s)

            return flat_grad2_kl + damping * vector
Example #18
    def learn(self, env, max_iter, batch_size):
        for i_iter in xrange(max_iter):
            s = env.reset()
            self._noise_generator.reset()
            done = False
            add_noise = i_iter * 1.0 / max_iter < self.explore_fraction
            e_reward = 0
            while not done:
                # env.render()
                noise = torch.FloatTensor(
                    self._noise_generator.generate()) if add_noise else None
                a = self.act(s, noise=noise)
                s_, r, done, info = env.step(a)
                self._replay_module.add(tuple((s, a, [r], s_, [int(done)])))
                s = s_
                e_reward += r

                if len(self._replay_module) < self.warmup_size:
                    continue
                # sample batch transitions
                b_s, b_a, b_r, b_s_, b_d = self._replay_module.sample(
                    batch_size)
                b_s = numpy.vstack(b_s)
                b_a = numpy.vstack(b_a)
                b_s, b_a, b_r, b_d = map(
                    lambda ryo: Variable(torch.FloatTensor(ryo)),
                    [b_s, b_a, b_r, b_d])
                b_s_ = Variable(torch.FloatTensor(b_s_), volatile=True)

                # update critic
                self._optimizer_critic.zero_grad()
                y = b_r + self.reward_gamma * self._target_critic(
                    b_s_, self._target_actor(b_s_)) * (1 - b_d)
                loss = self.loss(self._critic(b_s, b_a), y)
                loss.backward()
                self._optimizer_critic.step()

                # update actor
                self._optimizer_actor.zero_grad()
                loss = -self._critic(
                    b_s, self._actor(b_s)).mean()  # dpg, eq6 in [1]
                loss.backward()
                self._optimizer_actor.step()

                # update target networks
                for target, normal in [(self._target_actor, self._actor),
                                       (self._target_critic, self._critic)]:
                    target_vec = parameters_to_vector(target.parameters())
                    normal_vec = parameters_to_vector(normal.parameters())
                    vector_to_parameters(
                        (1 - self.tau) * target_vec + self.tau * normal_vec,
                        target.parameters())
            logger.info('Iter: {}, E_Reward: {}'.format(
                i_iter, round(e_reward, 2)))
Example #19
 def meta_loss(self, mini_batch, mini_batch_valid):
     loss = self.loss(mini_batch)
     params_mdl = self.mdl.update_params(loss['model_loss'], step_size=self.fast_lr,first_order=False)
     params_kg = self.kg.update_params(loss['model_loss'], step_size=self.fast_lr,first_order=False)
     old_params_mdl = parameters_to_vector(self.mdl.parameters())
     old_params_kg = parameters_to_vector(filter(lambda p: p.requires_grad, self.kg.parameters()))
     vector_to_parameters(params_mdl, self.mdl.parameters())
     vector_to_parameters(params_kg, filter(lambda p: p.requires_grad, self.kg.parameters()))
     loss1 = self.loss(mini_batch_valid)
     vector_to_parameters(old_params_mdl, self.mdl.parameters())
     vector_to_parameters(old_params_kg, filter(lambda p: p.requires_grad, self.kg.parameters()))
     return loss1
    def step(self,
             episodes,
             max_kl=1e-3,
             cg_iters=10,
             cg_damping=1e-2,
             ls_max_steps=10,
             ls_backtrack_ratio=0.5):
        """Meta-optimization step (ie. update of the initial parameters), based 
        on Trust Region Policy Optimization (TRPO, [4]).
        """
        old_loss, _, old_pis = self.surrogate_loss(episodes)
        print('old_loss: ', old_loss)
        # note: even when old_loss is on the order of 1e-8, the gradients are not necessarily small
        grads = torch.autograd.grad(old_loss, self.policy.parameters())
        grads = parameters_to_vector(grads)
        print('grads: ', grads)

        # Compute the step direction with Conjugate Gradient
        hessian_vector_product = self.hessian_vector_product(
            episodes, damping=cg_damping)
        stepdir = conjugate_gradient(hessian_vector_product,
                                     grads,
                                     cg_iters=cg_iters)

        # Compute the Lagrange multiplier
        shs = 0.5 * torch.dot(
            stepdir,
            hessian_vector_product(stepdir))  # dot of 3 matrices, sT.H.s
        lagrange_multiplier = torch.sqrt(shs / max_kl)
        # Note: the step direction is computed once, with all importance ratios
        # taken to be 1 (i.e. neglecting the difference between pi and old_pi).
        step = stepdir / lagrange_multiplier
        print('step: ', step)

        # Save the old parameters
        old_params = parameters_to_vector(self.policy.parameters())

        # Line search
        step_size = 1.0
        for _ in range(ls_max_steps):
            # assign values to policy network parameters
            # step is fixed during line search
            vector_to_parameters(old_params - step_size * step,
                                 self.policy.parameters())
            # print('oldpis: ', old_pis)
            loss, kl, _ = self.surrogate_loss(episodes, old_pis=old_pis)
            improve = loss - old_loss
            if (improve.item() < 0.0) and (kl.item() < max_kl):
                break
            step_size *= ls_backtrack_ratio
        else:
            vector_to_parameters(old_params, self.policy.parameters())
Example #21
 def _product(vector):
     kl = self.kl_divergence(episodes)
     grads = torch.autograd.grad(kl, self.policy.parameters(), 
             create_graph=True)
     flat_grad_kl = parameters_to_vector(grads)
     grad_kl_v = torch.dot(flat_grad_kl, vector)
     grad2s = torch.autograd.grad(grad_kl_v, self.policy.parameters())
     grad2s_copy = []
     for item in grad2s:
         item = item.contiguous(); grad2s_copy.append(item)
     grad2s = tuple(grad2s_copy)
     flat_grad2_kl = parameters_to_vector(grad2s)
     return flat_grad2_kl + damping * vector 
Example #22
        def _product(vector):
            kl = self.kl_divergence(episodes, inner_losses)
            grads = torch.autograd.grad(kl,
                                        self.parameters(),
                                        retain_graph=True,
                                        create_graph=True)
            flat_grad_kl = parameters_to_vector(grads)

            grad_kl_v = torch.dot(flat_grad_kl, vector)
            grad2s = torch.autograd.grad(grad_kl_v,
                                         self.parameters(),
                                         retain_graph=True)
            flat_grad2_kl = parameters_to_vector(grad2s)

            return flat_grad2_kl + damping * vector
Example #23
    def step(self,
             episodes,
             max_kl=1e-3,
             cg_iters=10,
             cg_damping=1e-2,
             ls_max_steps=10,
             ls_backtrack_ratio=0.5):
        """Meta-optimization step (ie. update of the initial parameters), based 
        on Trust Region Policy Optimization (TRPO, [4]).
        """
        old_loss, _, old_pis = self.surrogate_loss(episodes)

        if old_loss is None:
            # nothing needs to be done
            return

        grads = torch.autograd.grad(old_loss, self.policy.parameters())
        grads = parameters_to_vector(grads)

        # Compute the step direction with Conjugate Gradient
        hessian_vector_product = self.hessian_vector_product(
            episodes, damping=cg_damping)
        stepdir = conjugate_gradient(hessian_vector_product,
                                     grads,
                                     cg_iters=cg_iters)

        # Compute the Lagrange multiplier
        shs = 0.5 * torch.dot(stepdir, hessian_vector_product(stepdir))
        lagrange_multiplier = torch.sqrt(shs / max_kl)

        step = stepdir / lagrange_multiplier

        # Save the old parameters
        old_params = parameters_to_vector(self.policy.parameters())

        # Line search
        step_size = 1.0
        for _ in range(ls_max_steps):
            vector_to_parameters(old_params - step_size * step,
                                 self.policy.parameters())
            loss, kl, _ = self.surrogate_loss(episodes, old_pis=old_pis)
            improve = loss - old_loss
            # if the new loss is smaller, and kl divergence is small enough (so the new policy is not too far away)
            if (improve.item() < 0.0) and (kl.item() < max_kl):
                break
            step_size *= ls_backtrack_ratio
        else:
            vector_to_parameters(old_params, self.policy.parameters())
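In the step above, `shs = 0.5 * s^T H s` and the step direction is scaled by `1 / sqrt(shs / max_kl)`, so the quadratic approximation of the KL at the scaled step equals `max_kl` exactly. A small numeric check of that identity, with a random positive-definite matrix standing in for the Fisher/Hessian (illustrative only):

import torch

torch.manual_seed(0)
A = torch.randn(5, 5)
H = A @ A.t() + torch.eye(5)          # random positive-definite "Fisher" matrix
stepdir = torch.randn(5)
max_kl = 1e-3

shs = 0.5 * torch.dot(stepdir, H @ stepdir)
step = stepdir / torch.sqrt(shs / max_kl)
kl_quadratic = 0.5 * torch.dot(step, H @ step)
print(kl_quadratic.item())            # equals max_kl up to floating-point error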
Example #24
    def step(self,
             episodes,
             max_kl=1e-3,
             cg_iters=10,
             cg_damping=1e-2,
             ls_max_steps=10,
             ls_backtrack_ratio=0.5):
        """Meta-optimization step (ie. update of the initial parameters), based 
        on Trust Region Policy Optimization (TRPO, [4]).
        """
        old_loss, _, old_pis = self.surrogate_loss(episodes)
        grads = torch.autograd.grad(old_loss, self.policy.parameters())
        grads = parameters_to_vector(grads)

        # Compute the step direction with Conjugate Gradient
        hessian_vector_product = self.hessian_vector_product(
            episodes, damping=cg_damping)
        stepdir = conjugate_gradient(hessian_vector_product,
                                     grads,
                                     cg_iters=cg_iters)

        # Compute the Lagrange multiplier
        shs = 0.5 * torch.dot(stepdir, hessian_vector_product(stepdir))
        lagrange_multiplier = torch.sqrt(shs / max_kl)

        step = stepdir / lagrange_multiplier

        # Save the old parameters
        old_params = parameters_to_vector(self.policy.parameters())

        # Line search
        step_size = 2.0
        for _ in range(ls_max_steps):
            vector_to_parameters(old_params - step_size * step,
                                 self.policy.parameters())
            loss, kl, _ = self.surrogate_loss(episodes, old_pis=old_pis)
            improve = loss - old_loss
            if (improve.item() < 0.0) and (kl.item() < max_kl):
                # if improve.item() < 0.0:
                print("New Actor surrogate_loss: ", loss)
                break
            step_size *= ls_backtrack_ratio
        else:
            print("same actor~~~~")
            vector_to_parameters(old_params, self.policy.parameters())
            if self.policy.paramsFlag == OrderedDict(
                    self.policy.named_parameters()):
                print("really same~~~~~~~~")
Example #25
    def optimize(self):
        # Return if no completed episodes
        if len(self.buffers['completed_rewards']) == 0:
            return

        # Convert all buffers to tensors
        num_batch_steps = len(self.buffers['completed_rewards'])
        rewards = torch.tensor(self.buffers['completed_rewards'])
        actions = torch.stack(self.buffers['actions'][:num_batch_steps])
        states = torch.stack(self.buffers['states'][:num_batch_steps])
        log_probs = torch.stack(self.buffers['log_probs'][:num_batch_steps])
        rewards, actions, states, log_probs = (rewards.to(self.device),
                                               actions.to(self.device),
                                               states.to(self.device),
                                               log_probs.to(self.device))

        # Normalize rewards over episodes
        rewards = (rewards - rewards.mean()) / rewards.std()

        # Save current parameters
        self.optim.zero_grad()
        old_policy_param = parameters_to_vector(
            [param for param in self.policy.parameters()]).detach().clone()
        old_std_param = self.logstd.detach().clone()

        # Compute regular gradient and step
        (-log_probs * rewards.view(-1, 1)).mean().backward()
        self.optim.step()

        # Find search direction by Adam
        new_policy_param = parameters_to_vector(
            [param for param in self.policy.parameters()]).detach()
        policy_gradients = new_policy_param - old_policy_param
        std_gradients = self.logstd.detach() - old_std_param

        # Restore old policy
        vector_to_parameters(old_policy_param, self.policy.parameters())
        with torch.no_grad():
            self.logstd[:] = old_std_param

        # Find new policy and std with line search using Adam gradient
        self.line_search(policy_gradients, std_gradients, states, actions,
                         log_probs, rewards)

        # Update buffers removing processed steps
        for key, storage in self.buffers.items():
            if key != 'episode_reward':
                del storage[:num_batch_steps]
Example #26
    def compute_preconditioner(self, policy, state_tensor, action_tensor,
                               current_loss):

        # """ CHECK IF THE CURRENT LOSS USES A REPLAY BUFFER """
        # if current_loss.include_buffer and current_loss.buffer_init:
        #     # if so, add in the buffer states to the precompute stuff
        #     state_tensor = torch.cat([state_tensor.float(), current_loss.replay_buffer.buffer_states])
        #     action_tensor = torch.cat([action_tensor.float(), current_loss.replay_buffer.buffer_actions])
        """ CONVERT FORMAT """
        flat_states = torch.flatten(state_tensor, start_dim=0, end_dim=1)
        flat_actions = torch.flatten(action_tensor, start_dim=0, end_dim=1)
        """ COMPUTE FIRST STEP """
        # create copy
        policy_copy = copy.deepcopy(policy)
        # evaluate loss
        score = policy_copy(flat_states[0, :], flat_actions[0, :])
        # step
        score.backward()
        # get gradients
        grad_i = parameters_to_vector(
            [p.grad for p in policy_copy.parameters()])
        # take outer product
        H = torch.ger(grad_i, grad_i)
        # delete copy
        del policy_copy
        """ STEP THROUGH DATA AND BUILD FISHER INFO """
        for i in range(1, action_tensor.size()[0]):
            # create copy
            policy_copy = copy.deepcopy(policy)
            # zero the parameter gradients
            policy_copy.zero_grad()
            # evaluate loss
            score = policy_copy(flat_states[i, :], flat_actions[i, :])
            # step
            score.backward()
            # get gradients
            grad_i = parameters_to_vector(
                [p.grad for p in policy_copy.parameters()])
            # accumulate the outer product into the running estimate
            H += torch.ger(grad_i, grad_i)
            # delete copy
            del policy_copy
        """ STABALIZE FOR USE LATER """
        preconditioner = H / action_tensor.size()[0]
        preconditioner += torch.tensor(1e-4) * torch.eye(
            preconditioner.size()[0])
        """ RETURN THE PRECONDITIONED MATRIX """
        return preconditioner
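Accumulating per-sample outer products as above is equivalent to stacking the per-sample gradients into a matrix G and computing G^T G / N. A compact sketch of that vectorized form (illustrative tensors, not the policy above):

import torch

N, d = 32, 10
G = torch.randn(N, d)                 # row i = gradient for sample i
fisher = G.t() @ G / N                # average of the per-sample outer products
fisher += 1e-4 * torch.eye(d)         # same damping term as above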
Example #27
    def line_search(self, gradients, states, actions, log_probs, rewards):
        step_size = (2 * self.kl_delta / gradients.dot(
            self.fisher_vector_direct(gradients, states))).sqrt()
        step_size_decay = 1.5
        line_search_attempts = 10

        # New policy
        current_parameters = parameters_to_vector(self.policy.parameters())
        new_policy = deepcopy(self.policy)
        vector_to_parameters(current_parameters + step_size * gradients,
                             new_policy.parameters())
        new_std = self.logstd.detach() + step_size * self.logstd.grad

        #  Shrink gradient until KL constraint met and improvement
        for attempt in range(line_search_attempts):
            # Obtain kl divergence and objective
            with torch.no_grad():
                kl_value = self.kl(new_policy, new_std, states)
                objective = self.surrogate_objective(new_policy, new_std,
                                                     states, actions,
                                                     log_probs, rewards)

            # Shrink gradient if KL constraint not met or reward lower
            if kl_value > self.kl_delta or objective < 0:
                step_size /= step_size_decay
                vector_to_parameters(
                    current_parameters + step_size * gradients,
                    new_policy.parameters())
                new_std = self.logstd.detach() + step_size * self.logstd.grad
            #  Return new policy and std if KL and reward met
            else:
                return new_policy, new_std.requires_grad_()

        # Return old policy and std if constraints never met
        return self.policy, self.logstd
Example #28
def train_mt(params):
    env_fun, iters, animate, camera, model = params
    env = env_fun(animate=False, camera=camera)
    obs_dim, act_dim = env.obs_dim, env.act_dim

    policy = NN(obs_dim, act_dim).float()
    w = parameters_to_vector(policy.parameters()).detach().numpy()
    es = cma.CMAEvolutionStrategy(w, 0.5)

    print(
        "Env: {} Action space: {}, observation space: {}, N_params: {}, comments: ..."
        .format("Ant_reach", act_dim, obs_dim, len(w)))

    sims = [mujoco_py.MjSim(model) for _ in range(es.popsize)]
    policies = [policy] * es.popsize

    ctr = 0
    try:
        while not es.stop():
            ctr += 1
            if ctr > iters:
                break
            if ctr % 1000 == 0:
                sdir = os.path.join(
                    os.path.dirname(os.path.realpath(__file__)),
                    "agents/{}.p".format(env_fun.__name__))
                vector_to_parameters(
                    torch.from_numpy(es.result.xbest).float(),
                    policy.parameters())
                T.save(policy, sdir)
                print("Saved checkpoint")
            X = es.ask()

            output = mp.Queue()
            processes = []
            for i, ef, sim, policy, x in zip(range(es.popsize),
                                             [env_fun] * es.popsize, sims,
                                             policies, X):
                processes.append(
                    mp.Process(target=f_mp,
                               args=(i, ef, sim, policy, x, output)))

            # Run processes
            for p in processes:
                p.start()

            # Exit the completed processes
            for p in processes:
                p.join()

            evals = [output.get() for _ in processes]
            evals.sort(key=lambda x: x[0])
            evals = [ev[1] for ev in evals]

            es.tell(X, evals)
            es.disp()
    except KeyboardInterrupt:
        print("User interrupted process.")

    return es.result.fbest
Example #29
    def __init__(self, policy_params, trained_weights=None):
        Policy.__init__(self, policy_params)
        self.net = MLP_probs(self.ob_dim, self.ac_dim)
        #lin_policy = np.load('/home/harshit/work/ARS/trained_policies/Policy_Testerbi2/bi_policy_num_plus149.npz')

        #lin_policy = lin_policy.items()[0][1]
        #self.weights=None

        self.weights = parameters_to_vector(
            self.net.parameters()).detach().double().numpy()
        if trained_weights is not None:
            #print("hieohrfoiahfoidanfkjahdfj")
            self.net.load_state_dict(torch.load(trained_weights))
            #vector_to_parameters(torch.tensor(trained_weights), self.net.parameters())
            self.weights = parameters_to_vector(
                self.net.parameters()).detach().double().numpy()
Example #30
def train(params):
    env_fun, iters, animate, camera, _ = params

    env = env_fun(animate=animate, camera=camera)
    obs_dim, act_dim = env.obs_dim, env.act_dim
    policy = NN(obs_dim, act_dim).float()
    w = parameters_to_vector(policy.parameters()).detach().numpy()
    es = cma.CMAEvolutionStrategy(w, 0.5)
    f = f_wrapper(env, policy, animate)

    print(
        "Env: {} Action space: {}, observation space: {}, N_params: {}, comments: ..."
        .format(env_fun.__name__, act_dim, obs_dim, len(w)))
    it = 0
    try:
        while not es.stop():
            it += 1
            if it > iters:
                break
            if it % 1000 == 0:
                sdir = os.path.join(
                    os.path.dirname(os.path.realpath(__file__)),
                    "agents/{}.p".format(env_fun.__name__))
                vector_to_parameters(
                    torch.from_numpy(es.result.xbest).float(),
                    policy.parameters())
                T.save(policy, sdir)
                print("Saved checkpoint")
            X = es.ask()
            es.tell(X, [f(x) for x in X])
            es.disp()
    except KeyboardInterrupt:
        print("User interrupted process.")

    return es.result.fbest
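The `f_wrapper` objective used above is not shown on this page. An objective for CMA-ES over flat policy weights typically loads the candidate vector back into the network and returns a negated episode return. A minimal sketch under those assumptions (a gym-style `env` with numpy observations and a `policy` module as above; names are illustrative):

import torch
from torch.nn.utils import vector_to_parameters

def make_objective(env, policy, max_steps=1000):
    def f(w):
        # load the CMA-ES candidate (a numpy vector) into the policy
        vector_to_parameters(torch.from_numpy(w).float(), policy.parameters())
        obs, total_reward, done, steps = env.reset(), 0.0, False, 0
        while not done and steps < max_steps:
            with torch.no_grad():
                action = policy(torch.from_numpy(obs).float()).numpy()
            obs, reward, done, _ = env.step(action)
            total_reward += reward
            steps += 1
        return -total_reward   # CMA-ES minimizes, so negate the return
    return f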