Example #1
    def _update_policy_fit(self, m, init=False):
        """
        Re-estimate the local policy values in the neighborhood of the
        trajectory.
        Args:
            m: Condition
            init: Whether this is the initial fitting of the policy.
        """
        dX, dU, T = self.dX, self.dU, self.T
        # Choose samples to use.
        samples = self.cur[m].sample_list
        N = len(samples)
        pol_info = self.cur[m].pol_info
        X = samples.get_X()
        obs = samples.get_obs().copy()
        pol_mu, pol_sig = self.policy_opt.prob(obs)[:2]
        pol_info.pol_mu, pol_info.pol_sig = pol_mu, pol_sig

        # Update policy prior.
        policy_prior = pol_info.policy_prior
        if init:
            samples = SampleList(self.cur[m].sample_list)
            mode = self._hyperparams['policy_sample_mode']
        else:
            samples = SampleList([])
            mode = 'add'  # Don't replace with empty samples
        policy_prior.update(samples, self.policy_opt, mode)

        # Fit linearization and store in pol_info.
        pol_info.pol_K, pol_info.pol_k, pol_info.pol_S = \
                policy_prior.fit(X, pol_mu, pol_sig)
        for t in range(T):
            pol_info.chol_pol_S[t, :, :] = \
                    sp.linalg.cholesky(pol_info.pol_S[t, :, :])
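The Cholesky factors stored above are what turn the fitted linearization into a usable time-varying linear-Gaussian policy. A minimal sketch (shapes and names are illustrative, not the GPS API) of drawing an action from N(Kx + k, S) at one time step, using the upper-triangular factor that scipy.linalg.cholesky returns by default, the same convention as chol_pol_S above:

import numpy as np
import scipy.linalg

# Sketch only: dX, dU, K, k, S are made-up stand-ins, not taken from pol_info.
# scipy.linalg.cholesky returns upper-triangular U with S = U.T @ U.
dX, dU = 3, 2
rng = np.random.default_rng(0)
K = rng.normal(size=(dU, dX))        # linear feedback gains
k = rng.normal(size=dU)              # bias term
S = 0.1 * np.eye(dU)                 # policy covariance at this time step
U_chol = scipy.linalg.cholesky(S)    # upper triangular

x = rng.normal(size=dX)
u = K @ x + k + U_chol.T @ rng.normal(size=dU)   # draw u ~ N(Kx + k, S)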
Example #2
 def get_samples(self, condition, start=0, end=None):
     """
     Return the requested samples based on the start and end indices.
     Args:
         start: Starting index of samples to return.
         end: End index of samples to return.
     """
     return (SampleList(self._samples[condition][start:]) if end is None
             else SampleList(self._samples[condition][start:end]))
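The start/end handling above is plain Python slicing; a small stand-alone sketch (using a plain dict-of-lists store instead of the real per-condition sample storage) shows the semantics, including the common start=-N idiom for "the last N samples". The real method wraps the result in a SampleList, but the slicing behaves the same way.

def get_samples(store, condition, start=0, end=None):
    """Return samples[start:end] for a condition; end=None means 'to the end'."""
    data = store[condition]
    return data[start:] if end is None else data[start:end]

store = {0: ['s0', 's1', 's2', 's3']}
assert get_samples(store, 0, start=-2) == ['s2', 's3']    # last two samples
assert get_samples(store, 0, start=1, end=3) == ['s1', 's2']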
Example #3
 def _update_policy_fit(self, m, init=False):
     """
     Re-estimate the local policy values in the neighborhood of the
     trajectory.
     Args:
         m: Condition
         init: Whether this is the initial fitting of the policy.
     """
     dX, dU, T = self.dX, self.dU, self.T
     # Choose samples to use.
     samples = self.cur[m].sample_list
     N = len(samples)
     pol_info = self.cur[m].pol_info
     X = samples.get_X()
     pol_mu, pol_sig = self.policy_opt.prob(samples.get_obs().copy())[:2]
     pol_info.pol_mu, pol_info.pol_sig = pol_mu, pol_sig
     # Update policy prior.
     if init:
         self.cur[m].pol_info.policy_prior.update(
             samples, self.policy_opt,
             SampleList(self.cur[m].pol_info.policy_samples)
         )
     else:
         self.cur[m].pol_info.policy_prior.update(
             SampleList([]), self.policy_opt,
             SampleList(self.cur[m].pol_info.policy_samples)
         )
     # Collapse policy covariances. This is not really correct, but
     # it works fine so long as the policy covariance doesn't depend
     # on state.
     pol_sig = np.mean(pol_sig, axis=0)
     # Estimate the policy linearization at each time step.
     for t in range(T):
         # Assemble diagonal weights matrix and data.
         dwts = (1.0 / N) * np.ones(N)
         Ts = X[:, t, :]
         Ps = pol_mu[:, t, :]
         Ys = np.concatenate((Ts, Ps), axis=1)
         # Obtain Normal-inverse-Wishart prior.
         mu0, Phi, mm, n0 = self.cur[m].pol_info.policy_prior.eval(Ts, Ps)
         sig_reg = np.zeros((dX+dU, dX+dU))
         # On the first time step, always slightly regularize covariance.
         if t == 0:
             sig_reg[:dX, :dX] = 1e-8 * np.eye(dX)
         # Perform computation.
         pol_K, pol_k, pol_S = gauss_fit_joint_prior(Ys, mu0, Phi, mm, n0,
                                                     dwts, dX, dU, sig_reg)
         pol_S += pol_sig[t, :, :]
         pol_info.pol_K[t, :, :], pol_info.pol_k[t, :] = pol_K, pol_k
         pol_info.pol_S[t, :, :], pol_info.chol_pol_S[t, :, :] = \
                 pol_S, sp.linalg.cholesky(pol_S)
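gauss_fit_joint_prior above fits a joint Gaussian over (x, u) under a Normal-inverse-Wishart prior and conditions it to obtain K, k and S. A stripped-down sketch of the same idea without the prior, fitting u ≈ Kx + k by ordinary least squares at a single time step (shapes assumed, names illustrative, not the GPS helper):

import numpy as np

def fit_linear_gaussian(X_t, U_t):
    """X_t: (N, dX) states, U_t: (N, dU) actions at one time step."""
    N, dX = X_t.shape
    A = np.hstack([X_t, np.ones((N, 1))])             # design matrix with bias column
    W, _, _, _ = np.linalg.lstsq(A, U_t, rcond=None)  # (dX+1, dU)
    K, k = W[:dX].T, W[dX]                            # (dU, dX) gains, (dU,) bias
    resid = U_t - A @ W
    S = resid.T @ resid / N                           # residual covariance (dU, dU)
    return K, k, S

rng = np.random.default_rng(0)
X_t = rng.normal(size=(50, 3))
true_K = np.array([[1.0, 0.0, -2.0], [0.5, 1.0, 0.0]])
U_t = X_t @ true_K.T + 0.01 * rng.normal(size=(50, 2))
K, k, S = fit_linear_gaussian(X_t, U_t)
assert np.allclose(K, true_K, atol=0.1)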
Example #4
    def _update_policy_fit(self, m):
        """
        Re-estimate the local policy values in the neighborhood of the
        trajectory.
        Args:
            m: Condition
        """
        dX, dU, T = self.dX, self.dU, self.T
        # Choose samples to use.
        samples = self.cur[m].sample_list
        N = len(samples)
        pol_info = self.cur[m].pol_info
        X = samples.get_X()
        obs = samples.get_obs().copy()
        pol_mu, pol_sig = self.policy_opt.prob(obs)[:2]
        pol_info.pol_mu, pol_info.pol_sig = pol_mu, pol_sig

        # Update policy prior.
        policy_prior = pol_info.policy_prior
        samples = SampleList(self.cur[m].sample_list)
        mode = self._hyperparams['policy_sample_mode']
        print "mode: ", mode
        policy_prior.update(samples, self.policy_opt, mode)

        # Fit linearization and store in pol_info.
        pol_info.pol_K, pol_info.pol_k, pol_info.pol_S = \
                policy_prior.fit(X, pol_mu, pol_sig)
        for t in range(T):
            pol_info.chol_pol_S[t, :, :] = \
                    sp.linalg.cholesky(pol_info.pol_S[t, :, :])
Example #5
    def _test_policy_samples(self, N=None):
        """
        Test samples from the policy and collect the costs.
        Args:
            N: number of policy samples to take per condition.

        Returns:
            samples:    list of SampleList objects, one per condition
            costs:      list of costs, one per condition
            ee_points:  list of end-effector points, one per condition

        """
        if 'verbose_policy_trials' not in self._hyperparams:
            return None
        verbose = self._hyperparams['verbose_policy_trials']
        pol_samples = [[None] for _ in range(len(self._test_idx))]
        costs = list()
        ee_points = list()
        for cond in range(len(self._test_idx)):
            pol_samples[cond][0] = self.agent.sample(
                self.algorithm.policy_opt.policy,
                self._test_idx[cond],
                verbose=verbose,
                save=False,
                noisy=False)
            # in algorithm.py: _eval_cost
            policy_cost = self.algorithm.cost[0].eval(pol_samples[cond][0])[0]
            policy_cost = np.sum(policy_cost)  # sum over the 100-step horizon
            costs.append(policy_cost)
            ee_points.append(self.agent.get_ee_point(cond))
        return [SampleList(samples)
                for samples in pol_samples], costs, ee_points
Example #6
def take_policy_samples(agent, policy, conditions, n):
    return [
        SampleList([
            agent.sample(policy, cond, save=False, noisy=False)
            for _ in range(n)
        ]) for cond in range(conditions)
    ]
Example #7
    def _take_policy_samples(self, N=None):
        """
        Take samples from the policy to see how it's doing.
        Args:
            N  : number of policy samples to take per condition
        Returns: A list of SampleList objects, one per condition.
        """
        print(
            " ================================ test policy ===================================="
        )
        if 'verbose_policy_trials' not in self._hyperparams:
            # AlgorithmTrajOpt
            return None
        verbose = self._hyperparams['verbose_policy_trials']

        if self.gui:
            self.gui.set_status_text('Taking policy samples.')

        pol_samples = [[None] for _ in range(len(self._test_idx))]
        # Since this isn't noisy, just take one sample.
        # TODO: Make this noisy? Add hyperparam?
        # TODO: Take at all conditions for GUI?
        for cond in range(len(self._test_idx)):
            pol_samples[cond][0] = self.agent.sample(
                self.algorithm.policy_opt.policy,
                self._test_idx[cond],
                verbose=verbose,
                save=False,
                noisy=False)

        return [SampleList(samples) for samples in pol_samples]
Example #8
 def _take_policy_samples(self, N=None):
     """
     Take samples from the policy to see how it's doing.
     Args:
         N  : number of policy samples to take per condition
     Returns: A list of SampleList objects, one per condition.
     """
     if 'verbose_policy_trials' not in self._hyperparams:
         # AlgorithmTrajOpt
         return None
     verbose = self._hyperparams['verbose_policy_trials']  # whether to run verbose trials
     if self.gui:
         self.gui.set_status_text('Taking policy samples.')
     pol_samples = [[None] for _ in range(len(self._test_idx))]
     # Since this isn't noisy, just take one sample.
     # TODO: Make this noisy? Add hyperparam?
     # TODO: Take at all conditions for GUI?
     for cond in range(len(self._test_idx)):
         pol_samples[cond][0] = self.agent.sample(
             self.algorithm.policy_opt.policy,
             self._test_idx[cond],
             verbose=verbose,
             save=False,
             noisy=False
         )  # iterate through conditions, accumulating samples from the global policy
     return [SampleList(samples) for samples in pol_samples]  # one SampleList per condition
Example #9
    def _take_policy_samples(self,
                             N,
                             pol,
                             rnd=False,
                             randomize_initial_state=0):
        """Takes samples from the policy without exploration noise.

        Args:
            N: number of policy samples to take per condition.
            pol: Policy to sample. Specify `None` to sample the local LQR policies.
            rnd: Use random reset states.
            randomize_initial_state: Randomize initial state.

        """
        if pol is None:
            pol_samples = [[None] * N for _ in self._test_idx]  # independent rows (avoid list aliasing)
            for i, cond in enumerate(self._test_idx, 0):
                for n in trange(
                        N,
                        desc='Taking LQR-policy samples m=%d, cond=%s' %
                    (cond, 'rnd' if rnd else cond)):
                    pol_samples[i][n] = self.agent.sample(
                        self.algorithm.cur[cond].traj_distr,
                        None,
                        save=False,
                        noisy=False,
                        reset_cond=None if rnd else cond,
                        randomize_initial_state=randomize_initial_state,
                        record=False)
            return [SampleList(samples) for samples in pol_samples]
        else:
            conds = self._test_idx if not rnd else [None]
            # stores where the policy has led to
            pol_samples = [[None] * N for _ in conds]  # independent rows (avoid list aliasing)
            for i, cond in enumerate(conds):
                for n in trange(N,
                                desc='Taking %s policy samples cond=%s' %
                                (type(pol).__name__, 'rnd' if rnd else cond)):
                    pol_samples[i][n] = self.agent.sample(
                        pol,
                        None,
                        save=False,
                        noisy=False,
                        reset_cond=cond,
                        randomize_initial_state=randomize_initial_state,
                        record=n < 0)
            return [SampleList(samples) for samples in pol_samples]
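The per-row list comprehension in the initialization matters here: multiplying a nested list, as in [[None] * N] * len(...), repeats references to a single inner list, so every condition would share and overwrite the same sample slots. A short plain-Python demonstration of the pitfall:

# Multiplying a nested list repeats references to the same inner list,
# so writing through one "row" is visible through all of them.
aliased = [[None] * 3] * 2
aliased[0][0] = 'x'
assert aliased[1][0] == 'x'          # both rows point at the same list

independent = [[None] * 3 for _ in range(2)]
independent[0][0] = 'x'
assert independent[1][0] is None     # rows are distinct lists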
Example #10
    def __init__(self, hyperparams):
        config = copy.deepcopy(ALG)
        config.update(hyperparams)
        self._hyperparams = config

        if 'train_conditions' in hyperparams:
            self._cond_idx = hyperparams['train_conditions']
            self.M = len(self._cond_idx)
        else:
            self.M = hyperparams['conditions']
            self._cond_idx = range(self.M)
        self.iteration_count = 0

        # Grab a few values from the agent.
        agent = self._hyperparams['agent']
        self.T = self._hyperparams['T'] = agent.T
        self.dU = self._hyperparams['dU'] = agent.dU
        self.dX = self._hyperparams['dX'] = agent.dX
        self.dO = self._hyperparams['dO'] = agent.dO

        init_traj_distr = config['init_traj_distr']
        init_traj_distr['x0'] = agent.x0
        init_traj_distr['dX'] = agent.dX
        init_traj_distr['dU'] = agent.dU
        del self._hyperparams['agent']  # Don't want to pickle this.

        # IterationData objects for each condition.
        self.cur = [IterationData() for _ in range(self.M)]
        self.prev = [IterationData() for _ in range(self.M)]
        self.traj_distr = {self.iteration_count: []}
        self.traj_info = {self.iteration_count: []}
        self.kl_div = {self.iteration_count: []}
        self.dists_to_target = {self.iteration_count: []}
        self.sample_list = {i: SampleList([]) for i in range(self.M)}

        for m in range(self.M):
            self.cur[m].traj_info = TrajectoryInfo()
            dynamics = self._hyperparams['dynamics']
            self.cur[m].traj_info.dynamics = dynamics['type'](dynamics)
            init_traj_distr = extract_condition(
                self._hyperparams['init_traj_distr'], self._cond_idx[m])
            self.cur[m].traj_distr = init_traj_distr['type'](init_traj_distr)
            self.traj_distr[self.iteration_count].append(
                self.cur[m].traj_distr)
            self.traj_info[self.iteration_count].append(self.cur[m].traj_info)

        self.traj_opt = hyperparams['traj_opt']['type'](
            hyperparams['traj_opt'])
        self.cost = [
            hyperparams['cost']['type'](hyperparams['cost'])
            for _ in range(self.M)
        ]
        if self._hyperparams['ioc']:
            self.gt_cost = [
                hyperparams['gt_cost']['type'](hyperparams['gt_cost'])
                for _ in range(self.M)
            ]
        self.base_kl_step = self._hyperparams['kl_step']
Example #11
    def _assign_samples(self, samples, responsibilities):
        """
        Assigns samples to clusters by their responsibilities.

        """
        for m in range(self.M):
            self.cur[m].sample_list = SampleList([
                samples[i] for i in range(self.N) if responsibilities[i] == m
            ])
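The same hard assignment, sketched with plain lists (the sample values, responsibilities and cluster count below are made up): each sample goes to the cluster named by its responsibility.

def assign_samples(samples, responsibilities, num_clusters):
    """Group samples by hard cluster assignment."""
    clusters = [[] for _ in range(num_clusters)]
    for sample, m in zip(samples, responsibilities):
        clusters[m].append(sample)
    return clusters

assert assign_samples(['a', 'b', 'c'], [1, 0, 1], num_clusters=2) == [['b'], ['a', 'c']]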
Example #12
 def _take_policy_samples(self, cond_list):
     pol_samples = [[] for _ in range(len(cond_list))]
     for cond in range(len(cond_list)):
         for i in range(self._hyperparams['num_samples']):
             pol_samples[cond].append(
                 self.agent.sample(self.algorithm.policy_opt.policy,
                                   cond_list[cond],
                                   save=False))
     return [SampleList(samples) for samples in pol_samples]
Example #13
def iteration(itr, cfg, agent, algorithm):
    conditions = len(cfg['common']['train_conditions'])
    samples_per_cond = cfg['num_samples']
    sample_lists = []
    for cond in range(conditions):
        policy = algorithm.cur[cond].traj_distr
        sample_list = SampleList([
            agent.sample(policy, cond, save=False, noisy=True)
            for _ in range(samples_per_cond)
        ])
        sample_lists.append(sample_list)
    algorithm.iteration(sample_lists)
Example #14
    def _take_policy_samples(self, N=None, test_policy=False):
        """
        Take samples from the policy to see how it's doing.
        Args:
            N  : number of policy samples to take per condition
        Returns: A list of SampleList objects, one per condition.
        """
        if not N:
            N = self._hyperparams['verbose_policy_trials']
        if self.gui:
            self.gui.set_status_text('Taking policy samples.')

        verbose = self._hyperparams['verbose_policy_trials']
        pol_samples = [[None] for _ in range(len(self._test_idx))]
        # Since this isn't noisy, just take one sample.
        for cond in range(len(self._test_idx)):
            extra_args = {}
            if type(self.agent) == AgentSUPERball:
                extra_args = {
                    'superball_parameters': {
                        'reset':
                        ('reset' not in self._hyperparams['agent']
                         or self._hyperparams['agent']['reset'][cond]),
                        'relax':
                        ('relax' in self._hyperparams['agent']
                         and self._hyperparams['agent']['relax'][cond]),
                        'bottom_face':
                        (None if 'bottom_faces'
                         not in self._hyperparams['agent'] else
                         self._hyperparams['agent']['bottom_faces'][cond]),
                        'horizon':
                        (None if (not test_policy) or 'policy_test_horizon'
                         not in self._hyperparams['agent'] else
                         self._hyperparams['agent']['policy_test_horizon']),
                        'start_motor_positions':
                        (None if 'start_motor_positions'
                         not in self._hyperparams['agent'] else self.
                         _hyperparams['agent']['start_motor_positions'][cond]),
                        'motor_position_control_gain':
                        (None if 'motor_position_control_gain'
                         not in self._hyperparams['agent'] else
                         self._hyperparams['agent']
                         ['motor_position_control_gain'][cond]),
                        'debug':
                        False,
                    }
                }
            pol_samples[cond][0] = self.agent.sample(
                self.algorithm.policy_opt.policy,
                self._test_idx[cond],
                verbose=verbose,
                save=False,
                noisy=False,
                **extra_args)
        return [SampleList(samples) for samples in pol_samples]
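The repeated "None if key not in hyperparams else hyperparams[key][cond]" lookups above read as "per-condition setting with a default". A small sketch of that pattern using dict.get; the keys and values here are made up, and it only covers the per-condition keys, not scalar ones such as policy_test_horizon:

def per_condition(agent_hp, key, cond, default=None):
    """Return agent_hp[key][cond] if the key is present, else a default."""
    values = agent_hp.get(key)
    return default if values is None else values[cond]

agent_hp = {'bottom_faces': [2, 5], 'reset': [True, False]}
assert per_condition(agent_hp, 'bottom_faces', 1) == 5
assert per_condition(agent_hp, 'start_motor_positions', 0) is None
assert per_condition(agent_hp, 'reset', 1, default=True) is False   # default True mirrors the 'reset' logic above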
Example #15
 def _take_policy_samples(self, N, pol, rnd=False):
     """
     Take samples from the policy to see how it's doing.
     Args:
         N  : number of policy samples to take per condition
         pol: Policy to sample. None for LQR policies.
     Returns: A list of SampleList objects, one per condition.
     """
     if pol is None:
         pol_samples = [[None] * N for _ in self._test_idx]  # independent rows (avoid list aliasing)
         for i, cond in enumerate(self._test_idx, 0):
             for n in trange(
                     N,
                     desc='Taking LQR-policy samples m=%d, cond=%s' %
                 (cond, 'rnd' if rnd else cond)):
                 pol_samples[i][n] = self.agent.sample(
                     self.algorithm.cur[cond].traj_distr,
                     None,
                     verbose=None,
                     save=False,
                     noisy=False,
                     reset_cond=None if rnd else cond,
                     record=False)
         return [SampleList(samples) for samples in pol_samples]
     else:
         conds = self._test_idx if not rnd else [None]
         # stores where the policy has led to
         pol_samples = [[None] * N for _ in conds]  # independent rows (avoid list aliasing)
         for i, cond in enumerate(conds):
             for n in trange(N,
                             desc='Taking %s policy samples cond=%s' %
                             (type(pol).__name__, 'rnd' if rnd else cond)):
                 pol_samples[i][n] = self.agent.sample(pol,
                                                       None,
                                                       verbose=None,
                                                       save=False,
                                                       noisy=False,
                                                       reset_cond=cond,
                                                       record=n < 0)
         return [SampleList(samples) for samples in pol_samples]
Example #16
 def _take_policy_samples(self, cond_list, guided_steps=0, t_length=50):
     pol_samples = [[] for _ in range(len(cond_list))]
     for cond in range(len(cond_list)):
         for i in range(self._hyperparams['num_samples']):
             pol_samples[cond].append(
                 self.agent.sample(
                     self.algorithm.policy_opt.policy,
                     cond_list[cond],
                     start_policy=self.algorithm.cur[cond].traj_distr,
                     save=False,
                     ltorun=True,
                     guided_steps=guided_steps,
                     t_length=t_length))
     return [SampleList(samples) for samples in pol_samples]
Example #17
    def take_nn_samples(self, N=None):
        """
        take the NN policy
        Args:
            N:

        Returns:
            samples, costs, ee_points

        """
        """
            Take samples from the policy to see how it's doing.
            Args:
                N  : number of policy samples to take per condition
            Returns: None
            """

        if 'verbose_policy_trials' not in self._hyperparams:
            # AlgorithmTrajOpt
            return None
        verbose = self._hyperparams['verbose_policy_trials']
        if self.gui:
            self.gui.set_status_text('Taking policy samples.')
        pol_samples = [[None] for _ in range(len(self._test_idx))]
        # Since this isn't noisy, just take one sample.
        # TODO: Make this noisy? Add hyperparam?
        # TODO: Take at all conditions for GUI?
        costs = list()
        for cond in range(len(self._test_idx)):
            pol_samples[cond][0] = self.agent.sample(
                self.algorithm.policy_opt.policy,
                self._test_idx[cond],
                verbose=verbose,
                save=False,
                noisy=False)
            policy_cost = self.algorithm.cost[0].eval(pol_samples[cond][0])[0]
            policy_cost = np.sum(policy_cost)
            print "cost: %d" % policy_cost  # wait to plot in gui in gps_training_gui.py
            costs.append(policy_cost)

            ee_points = self.agent.get_ee_point(cond)

        return [SampleList(samples)
                for samples in pol_samples], costs, ee_points
Example #18
    def iteration(self, sample_lists):
        """
        Run iteration of LQR.
        Args:
            sample_lists: List of SampleList objects for each condition.
        """
        self.N = sum(len(self.sample_list[i]) for i in self.sample_list.keys())
        for m in range(self.M):
            self.cur[m].sample_list = sample_lists[m]
            prev_samples = self.sample_list[m].get_samples()
            prev_samples.extend(sample_lists[m].get_samples())
            self.sample_list[m] = SampleList(prev_samples)
            self.N += len(sample_lists[m])
        # Update dynamics model using all samples.
        self._update_dynamics()

        # Update the cost during learning if we use IOC.
        if self._hyperparams['ioc']:
            self._update_cost()

        self._update_step_size()  # KL Divergence step size.

        # Run inner loop to compute new policies.
        for _ in range(self._hyperparams['inner_iterations']):
            self._update_trajectories()

        # Computing KL-divergence between sample distribution and demo distribution
        itr = self.iteration_count
        if self._hyperparams['ioc']:
            for i in xrange(self.M):
                mu, sigma = self.traj_opt.forward(self.traj_distr[itr][i], self.traj_info[itr][i])
                # KL divergence between current traj. distribution and gt distribution
                self.kl_div[itr].append(traj_distr_kl(mu, sigma, self.traj_distr[itr][i], self.demo_traj[i]))

        if self._hyperparams['learning_from_prior']:
            for i in xrange(self.M):
                target_position = self._hyperparams['target_end_effector'][:3]
                cur_samples = sample_lists[i].get_samples()
                sample_end_effectors = [cur_samples[i].get(END_EFFECTOR_POINTS) for i in xrange(len(cur_samples))]
                dists = [np.amin(np.sqrt(np.sum((sample_end_effectors[i][:, :3] - target_position.reshape(1, -1))**2, axis = 1)), axis = 0) \
                         for i in xrange(len(cur_samples))]
                self.dists_to_target[itr].append(sum(dists) / len(cur_samples))   
        self._advance_iteration_variables()
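The distance bookkeeping at the end condenses to: for each sampled trajectory, take the closest approach of the end-effector to the target over all time steps, then average over the condition's samples. A small sketch of the per-trajectory part (array shapes assumed, values made up):

import numpy as np

def min_dist_to_target(ee_points, target):
    """ee_points: (T, >=3) end-effector positions; target: (3,) goal position."""
    diffs = ee_points[:, :3] - target.reshape(1, -1)
    return np.sqrt(np.sum(diffs ** 2, axis=1)).min()   # closest approach over time

ee = np.array([[1.0, 0.0, 0.0],
               [0.2, 0.1, 0.0],
               [0.5, 0.5, 0.5]])
assert np.isclose(min_dist_to_target(ee, np.zeros(3)), np.sqrt(0.05))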
Example #19
 def _take_policy_samples(self, N=None):
     """
     Take samples from the policy to see how it's doing.
     Args:
         N  : number of policy samples to take per condition
     Returns: A list of SampleList objects, one per condition.
     """
     if 'verbose_policy_trials' not in self._hyperparams:
         return None
     if not N:
         N = self._hyperparams['verbose_policy_trials']
     if self.gui:
         self.gui.set_status_text('Taking policy samples.')
     pol_samples = [[None for _ in range(N)] for _ in range(self._conditions)]
     for cond in range(len(self._test_idx)):
         for i in range(N):
             pol_samples[cond][i] = self.agent.sample(
                 self.algorithm.policy_opt.policy, self._test_idx[cond],
                 verbose=True, save=False)
     return [SampleList(samples) for samples in pol_samples]
Example #20
    def _update_policy_fit(self, m):
        """
        Re-estimate the local policy values in the neighborhood of the
        trajectory.
        Args:
            m: Condition
        """
        dX, dU, T = self.dX, self.dU, self.T
        # Choose samples to use.
        samples = self.cur[m].sample_list
        N = len(samples)
        pol_info = self.cur[m].pol_info
        X = samples.get_X()
        obs = samples.get_obs().copy()
        pol_mu, pol_sig = self.policy_opt.prob(obs)[:2]
        pol_info.pol_mu, pol_info.pol_sig = pol_mu, pol_sig

        # Update policy prior.
        policy_prior = pol_info.policy_prior

        # policy_prior is a PolicyPriorGMM (gps.algorithm.policy.policy_prior_gmm);
        # its update() and fit() methods are defined in that module.

        samples = SampleList(self.cur[m].sample_list)
        mode = self._hyperparams['policy_sample_mode']
        policy_prior.update(samples, self.policy_opt, mode)

        # Fit linearization and store in pol_info (PolicyPriorGMM.fit).
        pol_info.pol_K, pol_info.pol_k, pol_info.pol_S = \
                policy_prior.fit(X, pol_mu, pol_sig)
        for t in range(T):
            pol_info.chol_pol_S[t, :, :] = \
                    sp.linalg.cholesky(pol_info.pol_S[t, :, :])
Example #21
 def get_reset_samples(self, condition, start=0, end=None):
     return (SampleList(self._samples[condition][start:]) if end is None
             else SampleList(self._samples[condition][start:end]))
Example #22
	def generate(self):
		"""
		 Generate demos and save them in a file for experiment.
		 Returns: None.
		"""
		# Load the algorithm
		import pickle

		algorithm_file = self._algorithm_files_dir # This should give us the optimal controller. Maybe set to 'controller_itr_%02d.pkl' % itr_load will be better?
		self.algorithm = pickle.load(open(algorithm_file))
		if self.algorithm is None:
			print("Error: cannot find '%s.'" % algorithm_file)
			os._exit(1) # called instead of sys.exit(), since t

		# Keep the initial states of the agent the same as in the demonstrations.
		self._learning = self.ioc_algo._hyperparams['learning_from_prior'] # if the experiment is learning from prior experience
		agent_config = self._hyperparams['demo_agent']
		if agent_config['filename'] == './mjc_models/pr2_arm3d.xml' and not self._learning:
			agent_config['x0'] = self.algorithm._hyperparams['agent_x0']
			agent_config['pos_body_idx'] = self.algorithm._hyperparams['agent_pos_body_idx']
			agent_config['pos_body_offset'] = self.algorithm._hyperparams['agent_pos_body_offset']
		self.agent = agent_config['type'](agent_config)

		# Roll out the demonstrations from controllers
		var_mult = self.algorithm._hyperparams['var_mult']
		T = self.algorithm.T
		demos = []

		M = agent_config['conditions']
		N = self.ioc_algo._hyperparams['num_demos']
		if not self._learning:
			controllers = {}
			good_conds = self.ioc_algo._hyperparams['demo_cond']

			# Store each controller under M conditions into controllers.
			for i in xrange(M):
				controllers[i] = self.algorithm.cur[i].traj_distr
			controllers_var = copy.copy(controllers)
			for i in xrange(M):

				# Increase controller variance.
				controllers_var[i].chol_pol_covar *= var_mult
				# Gather demos.
				for j in xrange(N):
					demo = self.agent.sample(
						controllers_var[i], i,
						verbose=(i < self.algorithm._hyperparams['demo_verbose']),
						save = True
					)
					demos.append(demo)
		else:
			# Extract the neural network policy.
			pol = self.algorithm.policy_opt.policy
			for i in xrange(M):
				# Gather demos.
				demo = self.agent.sample(
					pol, i,
					verbose=(i < self._hyperparams['verbose_trials'])
					)
				demos.append(demo)

		# Filter out worst (M - good_conds) demos.
		target_position = agent_config['target_end_effector'][:3]
		dists_to_target = np.zeros(M)
		for i in xrange(M):
			demo_end_effector = demos[i].get(END_EFFECTOR_POINTS)
			dists_to_target[i] = np.amin(np.sqrt(np.sum((demo_end_effector[:, :3] - target_position.reshape(1, -1))**2, axis = 1)), axis = 0)
		if not self._learning:
			good_indices = dists_to_target.argsort()[:good_conds - M].tolist()
		else:
			good_indicators = (dists_to_target <= agent_config['success_upper_bound']).tolist()
			good_indices = [i for i in xrange(len(good_indicators)) if good_indicators[i]]
			bad_indices = np.argmax(dists_to_target)
			self.ioc_algo._hyperparams['demo_cond'] = len(good_indices)
		filtered_demos = []
		self.ioc_algo.demo_conditions = []
		self.ioc_algo.failed_conditions = []
		exp_dir = self._data_files_dir.replace("data_files", "")
		with open(exp_dir + 'log.txt', 'a') as f:
			f.write('\nThe demo conditions are: \n')
		for i in good_indices:
			filtered_demos.append(demos[i])
			self.ioc_algo.demo_conditions.append(agent_config['pos_body_offset'][i])
			with open(exp_dir + 'log.txt', 'a') as f:
				f.write('\n' + str(agent_config['pos_body_offset'][i]) + '\n')
		with open(exp_dir + 'log.txt', 'a') as f:
			f.write('\nThe failed badmm conditions are: \n')
		for i in xrange(M):
			if i not in good_indices:
				self.ioc_algo.failed_conditions.append(agent_config['pos_body_offset'][i])
				with open(exp_dir + 'log.txt', 'a') as f:
					f.write('\n' + str(agent_config['pos_body_offset'][i]) + '\n')
		# import pdb; pdb.set_trace()
		shuffle(filtered_demos)
		demo_list =  SampleList(filtered_demos)
		demo_store = {'demoX': demo_list.get_X(), 'demoU': demo_list.get_U(), 'demoO': demo_list.get_obs()}
		if self._learning:
			demo_store['pos_body_offset'] = [agent_config['pos_body_offset'][bad_indices]]
		# Save the demos.
		self.data_logger.pickle(
			self._data_files_dir + 'demos.pkl',
			copy.copy(demo_store)
		)
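The filtering rule dists_to_target.argsort()[:good_conds - M] keeps the good_conds demos whose end-effector came closest to the target, i.e. it drops the worst M - good_conds of them. A quick numeric check with made-up distances:

import numpy as np

dists_to_target = np.array([0.30, 0.05, 0.80, 0.10, 0.55])
M, good_conds = len(dists_to_target), 3
good_indices = dists_to_target.argsort()[:good_conds - M].tolist()
assert sorted(good_indices) == [0, 1, 3]   # the three closest demos survive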
Example #23
    def _eval_cost(self, cond, prev_cost=False):
        """
        Evaluate costs for all samples for a condition.
        Args:
            cond: Condition to evaluate cost on.
            prev_cost: Whether or not to use previous_cost (for IOC step adjustment).
        """
        # Constants.
        T, dX, dU = self.T, self.dX, self.dU

        synN = self._hyperparams['synthetic_cost_samples']
        if synN > 0:
            agent = self.cur[cond].sample_list.get_samples()[0].agent
            X, U, _ = self._traj_samples(cond, synN)
            syn_samples = []
            for i in range(synN):
                sample = Sample(agent)
                sample.set_XU(X[i, :, :], U[i, :, :])
                syn_samples.append(sample)
            all_samples = SampleList(syn_samples +
                                     self.cur[cond].sample_list.get_samples())
        else:
            all_samples = self.cur[cond].sample_list
        N = len(all_samples)

        # Compute cost.
        cs = np.zeros((N, T))
        cc = np.zeros((N, T))
        cv = np.zeros((N, T, dX + dU))
        Cm = np.zeros((N, T, dX + dU, dX + dU))
        if self._hyperparams['ioc']:
            cgt = np.zeros((N, T))
        for n in range(N):
            sample = all_samples[n]
            # Get costs.
            if prev_cost:
                l, lx, lu, lxx, luu, lux = self.previous_cost[cond].eval(
                    sample)
            else:
                l, lx, lu, lxx, luu, lux = self.cost[cond].eval(sample)
            # Compute the ground truth cost
            if self._hyperparams['ioc'] and n >= synN:
                l_gt, _, _, _, _, _ = self.gt_cost[cond].eval(sample)
                cgt[n, :] = l_gt
            cc[n, :] = l
            cs[n, :] = l

            # Assemble matrix and vector.
            cv[n, :, :] = np.c_[lx, lu]
            Cm[n, :, :, :] = np.concatenate(
                (np.c_[lxx, np.transpose(lux, [0, 2, 1])], np.c_[lux, luu]),
                axis=1)

            # Adjust for expanding cost around a sample.
            X = sample.get_X()
            U = sample.get_U()
            yhat = np.c_[X, U]
            rdiff = -yhat
            rdiff_expand = np.expand_dims(rdiff, axis=2)
            cv_update = np.sum(Cm[n, :, :, :] * rdiff_expand, axis=1)
            cc[n, :] += np.sum(rdiff * cv[n, :, :], axis=1) + 0.5 * \
                    np.sum(rdiff * cv_update, axis=1)
            cv[n, :, :] += cv_update

        # Fill in cost estimate.
        if prev_cost:
            traj_info = self.cur[cond].prevcost_traj_info
            traj_info.dynamics = self.cur[cond].traj_info.dynamics
            traj_info.x0sigma = self.cur[cond].traj_info.x0sigma
            traj_info.x0mu = self.cur[cond].traj_info.x0mu
        else:
            traj_info = self.cur[cond].traj_info
            self.cur[cond].cs = cs[synN:]  # True value of cost.
        traj_info.cc = np.mean(cc, 0)  # Constant term (scalar).
        traj_info.cv = np.mean(cv, 0)  # Linear term (vector).
        traj_info.Cm = np.mean(Cm, 0)  # Quadratic term (matrix).

        if self._hyperparams['ioc']:
            self.cur[cond].cgt = cgt[synN:]
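The rdiff update in the middle re-centers a quadratic cost expansion taken around the sample point yhat = [x; u] so that it is expressed around the origin: with rdiff = -yhat, the constant picks up rdiff·cv + 0.5·rdiffᵀCm·rdiff and the linear term picks up Cm·rdiff. A numeric check for a single time step (symmetric Cm assumed, all values random placeholders):

import numpy as np

rng = np.random.default_rng(1)
d = 4
Cm = rng.normal(size=(d, d)); Cm = Cm @ Cm.T   # symmetric quadratic term
cv = rng.normal(size=d)                        # linear term
cc = 0.7                                       # constant term
yhat = rng.normal(size=d)                      # expansion point [x; u] of the sample

rdiff = -yhat
cc_new = cc + rdiff @ cv + 0.5 * rdiff @ Cm @ rdiff
cv_new = cv + Cm @ rdiff

y = rng.normal(size=d)
around_sample = cc + cv @ (y - yhat) + 0.5 * (y - yhat) @ Cm @ (y - yhat)
around_origin = cc_new + cv_new @ y + 0.5 * y @ Cm @ y
assert np.isclose(around_sample, around_origin)   # same quadratic, different center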
Example #24
    def run(self):
        """Runs training by alternatively taking samples and optimizing the policy."""
        if 'load_model' in self._hyperparams:
            self.iteration_count = self._hyperparams['load_model'][1]
            self.algorithm.policy_opt.iteration_count = self.iteration_count
            self.algorithm.policy_opt.restore_model(
                *self._hyperparams['load_model'])

            # Global policy static resets
            if self._hyperparams['num_pol_samples_static'] > 0:
                self.export_samples(self._take_policy_samples(
                    N=self._hyperparams['num_pol_samples_static'],
                    pol=self.algorithm.policy_opt.policy,
                    rnd=False),
                                    '_pol-static',
                                    visualize=True)

            return

        for itr in range(self._hyperparams['iterations']):
            self.iteration_count = itr
            if hasattr(self.algorithm, 'traj_opt'):
                self.algorithm.traj_opt.iteration_count = itr
            if hasattr(self.algorithm, 'policy_opt'):
                self.algorithm.policy_opt.iteration_count = itr

            print("*** Iteration %02d ***" % itr)
            if itr == 0 and 'load_initial_samples' in self._hyperparams:
                # Load trajectory samples
                print('Loading initial samples ...')
                sample_files = self._hyperparams['load_initial_samples']
                traj_sample_lists = [[] for _ in range(self.algorithm.M)]
                for sample_file in sample_files:
                    data = np.load(sample_file)
                    X, U = data['X'], data['U']
                    assert X.shape[0] == self.algorithm.M
                    for m in range(self.algorithm.M):
                        for n in range(X.shape[1]):
                            traj_sample_lists[m].append(
                                self.agent.pack_sample(X[m, n], U[m, n]))
                traj_sample_lists = [
                    SampleList(traj_samples)
                    for traj_samples in traj_sample_lists
                ]
            else:
                # Take trajectory samples
                with Timer(self.algorithm.timers, 'sampling'):
                    for cond in self._train_idx:
                        for i in trange(self._hyperparams['num_samples'],
                                        desc='Taking samples'):
                            self._take_sample(cond, i)
                traj_sample_lists = [
                    self.agent.get_samples(cond,
                                           -self._hyperparams['num_samples'])
                    for cond in self._train_idx
                ]
            self.export_samples(traj_sample_lists, visualize=True)

            # Iteration
            with Timer(self.algorithm.timers, 'iteration'):
                self.algorithm.iteration(traj_sample_lists, itr)
            self.export_dynamics()
            self.export_controllers()
            self.export_times()
            if hasattr(self.algorithm, 'policy_opt') and hasattr(
                    self.algorithm.policy_opt, 'store_model'):
                self.algorithm.policy_opt.store_model()

            # Sample learned policies for visualization

            # LQR policies static resets
            if self._hyperparams['num_lqr_samples_static'] > 0:
                self.export_samples(self._take_policy_samples(
                    N=self._hyperparams['num_lqr_samples_static'],
                    pol=None,
                    rnd=False),
                                    '_lqr-static',
                                    visualize=True)

            # LQR policies random resets
            if self._hyperparams['num_lqr_samples_random'] > 0:
                self.export_samples(self._take_policy_samples(
                    N=self._hyperparams['num_lqr_samples_random'],
                    pol=None,
                    rnd=True),
                                    '_lqr-random',
                                    visualize=True)

            # LQR policies state noise
            if self._hyperparams['num_lqr_samples_random'] > 0:
                self.export_samples(self._take_policy_samples(
                    N=self._hyperparams['num_lqr_samples_random'],
                    pol=None,
                    rnd=False,
                    randomize_initial_state=24),
                                    '_lqr-static-randomized',
                                    visualize=True)

            if hasattr(self.algorithm, 'policy_opt'):
                # Global policy static resets
                if self._hyperparams['num_pol_samples_static'] > 0:
                    self.export_samples(self._take_policy_samples(
                        N=self._hyperparams['num_pol_samples_static'],
                        pol=self.algorithm.policy_opt.policy,
                        rnd=False),
                                        '_pol-static',
                                        visualize=True)

                # Global policy random resets
                if self._hyperparams['num_pol_samples_random'] > 0:
                    self.export_samples(self._take_policy_samples(
                        N=self._hyperparams['num_pol_samples_random'],
                        pol=self.algorithm.policy_opt.policy,
                        rnd=True),
                                        '_pol-random',
                                        visualize=True)

                # Global policy state noise
                if self._hyperparams['num_pol_samples_random'] > 0:
                    self.export_samples(self._take_policy_samples(
                        N=self._hyperparams['num_pol_samples_random'],
                        pol=self.algorithm.policy_opt.policy,
                        rnd=False,
                        randomize_initial_state=24),
                                        '_pol-static-randomized',
                                        visualize=True)

            self.visualize_training_progress()
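Timer(self.algorithm.timers, 'sampling') is project-specific; a minimal sketch of a context manager with that call shape, accumulating wall-clock seconds under a key in a dict-like store, could look like the following (the real class may differ):

import time

class Timer:
    """Accumulate elapsed wall-clock time under `key` in a dict-like store."""
    def __init__(self, store, key):
        self.store, self.key = store, key
    def __enter__(self):
        self._t0 = time.time()
        return self
    def __exit__(self, exc_type, exc, tb):
        elapsed = time.time() - self._t0
        self.store[self.key] = self.store.get(self.key, 0.0) + elapsed

timers = {}
with Timer(timers, 'sampling'):
    time.sleep(0.01)
assert timers['sampling'] >= 0.01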
Example #25
    def generate(self):
        """
		 Generate demos and save them in a file for experiment.
		 Returns: None.
		"""
        # Load the algorithm
        import pickle

        algorithm_file = self._algorithm_files_dir  # This should give us the optimal controller. Maybe set to 'controller_itr_%02d.pkl' % itr_load will be better?
        self.algorithm = pickle.load(open(algorithm_file))
        if self.algorithm is None:
            print("Error: cannot find '%s.'" % algorithm_file)
            os._exit(1)  # called instead of sys.exit(), since t

        # Keep the initial states of the agent the same as in the demonstrations.
        self._learning = self.ioc_algo._hyperparams[
            'learning_from_prior']  # if the experiment is learning from prior experience
        agent_config = self._hyperparams['demo_agent']
        if agent_config[
                'filename'] == './mjc_models/pr2_arm3d.xml' and not self._learning:
            agent_config['x0'] = self.algorithm._hyperparams['agent_x0']
            agent_config['pos_body_idx'] = self.algorithm._hyperparams[
                'agent_pos_body_idx']
            agent_config['pos_body_offset'] = self.algorithm._hyperparams[
                'agent_pos_body_offset']
        self.agent = agent_config['type'](agent_config)

        # Roll out the demonstrations from controllers
        var_mult = self.algorithm._hyperparams['var_mult']
        T = self.algorithm.T
        demos = []

        M = agent_config['conditions']
        N = self.ioc_algo._hyperparams['num_demos']
        if not self._learning:
            controllers = {}
            good_conds = self.ioc_algo._hyperparams['demo_cond']

            # Store each controller under M conditions into controllers.
            for i in xrange(M):
                controllers[i] = self.algorithm.cur[i].traj_distr
            controllers_var = copy.copy(controllers)
            for i in xrange(M):

                # Increase controller variance.
                controllers_var[i].chol_pol_covar *= var_mult
                # Gather demos.
                for j in xrange(N):
                    demo = self.agent.sample(
                        controllers_var[i],
                        i,
                        verbose=(i <
                                 self.algorithm._hyperparams['demo_verbose']),
                        save=True)
                    demos.append(demo)
        else:
            # Extract the neural network policy.
            pol = self.algorithm.policy_opt.policy
            for i in xrange(M):
                # Gather demos.
                demo = self.agent.sample(
                    pol, i, verbose=(i < self._hyperparams['verbose_trials']))
                demos.append(demo)

        # Filter out worst (M - good_conds) demos.
        target_position = agent_config['target_end_effector'][:3]
        dists_to_target = np.zeros(M)
        for i in xrange(M):
            demo_end_effector = demos[i].get(END_EFFECTOR_POINTS)
            dists_to_target[i] = np.amin(np.sqrt(
                np.sum((demo_end_effector[:, :3] -
                        target_position.reshape(1, -1))**2,
                       axis=1)),
                                         axis=0)
        if not self._learning:
            good_indices = dists_to_target.argsort()[:good_conds - M].tolist()
        else:
            good_indicators = (dists_to_target <=
                               agent_config['success_upper_bound']).tolist()
            good_indices = [
                i for i in xrange(len(good_indicators)) if good_indicators[i]
            ]
            bad_indices = np.argmax(dists_to_target)
            self.ioc_algo._hyperparams['demo_cond'] = len(good_indices)
        filtered_demos = []
        self.ioc_algo.demo_conditions = []
        self.ioc_algo.failed_conditions = []
        exp_dir = self._data_files_dir.replace("data_files", "")
        with open(exp_dir + 'log.txt', 'a') as f:
            f.write('\nThe demo conditions are: \n')
        for i in good_indices:
            filtered_demos.append(demos[i])
            self.ioc_algo.demo_conditions.append(
                agent_config['pos_body_offset'][i])
            with open(exp_dir + 'log.txt', 'a') as f:
                f.write('\n' + str(agent_config['pos_body_offset'][i]) + '\n')
        with open(exp_dir + 'log.txt', 'a') as f:
            f.write('\nThe failed badmm conditions are: \n')
        for i in xrange(M):
            if i not in good_indices:
                self.ioc_algo.failed_conditions.append(
                    agent_config['pos_body_offset'][i])
                with open(exp_dir + 'log.txt', 'a') as f:
                    f.write('\n' + str(agent_config['pos_body_offset'][i]) +
                            '\n')
        # import pdb; pdb.set_trace()
        shuffle(filtered_demos)
        demo_list = SampleList(filtered_demos)
        demo_store = {
            'demoX': demo_list.get_X(),
            'demoU': demo_list.get_U(),
            'demoO': demo_list.get_obs()
        }
        if self._learning:
            demo_store['pos_body_offset'] = [
                agent_config['pos_body_offset'][bad_indices]
            ]
        # Save the demos.
        self.data_logger.pickle(self._data_files_dir + 'demos.pkl',
                                copy.copy(demo_store))