Example No. 1
def train_data(condition_data, T, dO, dU):
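    """Assemble supervised training targets for the global policy.

    For each condition, computes per-timestep target actions from the local
    linear-Gaussian controller (traj_distr), adjusted by the dual terms stored
    in pol_info, together with the corresponding precisions and weights, and
    concatenates them across conditions along with the observations.
    Assumes numpy is imported as np and that PolicyInfo comes from the
    surrounding GPS-style codebase.
    """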
    pol_info = PolicyInfo({'init_pol_wt': 0.01, 'T': T, 'dU': dU, 'dX': 32})

    obs_data, tgt_mu = np.zeros((0, T, dO)), np.zeros((0, T, dU))
    tgt_prc, tgt_wt = np.zeros((0, T, dU, dU)), np.zeros((0, T))

    # Each entry of condition_data holds the sampled trajectories and the
    # fitted local trajectory distribution for one condition.
    for data in condition_data:
        samples, traj = data['samples'], data['traj_distr']
        X = samples.get_X()
        N = len(samples)
        mu = np.zeros((N, T, dU))
        prc = np.zeros((N, T, dU, dU))
        wt = np.zeros((N, T))
        # Get time-indexed actions.
        for t in range(T):
            # Compute actions along this trajectory.
            prc[:, t, :, :] = np.tile(traj.inv_pol_covar[t, :, :], [N, 1, 1])
            for i in range(N):
                mu[i, t, :] = \
                        (traj.K[t, :, :].dot(X[i, t, :]) + traj.k[t, :]) - \
                        np.linalg.solve(
                            prc[i, t, :, :] / pol_info.pol_wt[t],
                            pol_info.lambda_K[t, :, :].dot(X[i, t, :]) + \
                                    pol_info.lambda_k[t, :]
                        )
            wt[:, t].fill(pol_info.pol_wt[t])
        tgt_mu = np.concatenate((tgt_mu, mu))
        tgt_prc = np.concatenate((tgt_prc, prc))
        tgt_wt = np.concatenate((tgt_wt, wt))
        obs_data = np.concatenate((obs_data, samples.get_obs()))
    return obs_data, tgt_mu, tgt_prc, tgt_wt
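
A minimal usage sketch for train_data, assuming a GPS-style policy optimizer whose update() accepts (obs, tgt_mu, tgt_prc, tgt_wt); condition_data, T, dO and dU are prepared by the caller:

# Sketch only: hand the assembled targets to the policy optimizer.
obs_data, tgt_mu, tgt_prc, tgt_wt = train_data(condition_data, T, dO, dU)
policy_opt.update(obs_data, tgt_mu, tgt_prc, tgt_wt)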
Example No. 2
    def __init__(self, hyperparams):
        # ALG_BADMM = {
        #     'inner_iterations': 4,
        #     'policy_dual_rate': 0.1,
        #     'policy_dual_rate_covar': 0.0,
        #     'fixed_lg_step': 0,
        #     'lg_step_schedule': 10.0,
        #     'ent_reg_schedule': 0.0,
        #     'init_pol_wt': 0.01,
        #     'policy_sample_mode': 'add',
        #     'exp_step_increase': 2.0,
        #     'exp_step_decrease': 0.5,
        #     'exp_step_upper': 0.5,
        #     'exp_step_lower': 1.0,
        # }

        # Copy and update parameters
        config = copy.deepcopy(ALG_BADMM)
        # update() merges the supplied hyperparams' key/value pairs into the copied defaults
        config.update(hyperparams)

        # Initialize the base class (Algorithm.__init__ in algorithm.py)
        Algorithm.__init__(self, config)

        # algorithm['policy_prior'] = {
        #     'type': PolicyPriorGMM,
        #     'max_clusters': 20,
        #     'min_samples_per_cluster': 40,
        #     'max_samples': 40,
        # }

        policy_prior = self._hyperparams['policy_prior']

        # self._cond_idx = hyperparams['train_conditions']
        # self.M = hyperparams['conditions'] = 2
        for m in range(self.M):
            # self.cur = [IterationData() for _ in range(self.M)]
            # Initialize policy information
            self.cur[m].pol_info = PolicyInfo(self._hyperparams)
            self.cur[m].pol_info.policy_prior = \
                    policy_prior['type'](policy_prior)

        # algorithm['policy_opt'] = {
        #     'type': PolicyOptTf,
        #     'network_params': {
        #         'obs_include': [JOINT_ANGLES, JOINT_VELOCITIES],
        #         'obs_vector_data': [JOINT_ANGLES, JOINT_VELOCITIES],
        #         'sensor_dims': SENSOR_DIMS,
        #     },
        #     'network_model': tf_network,
        #     'iterations': 1000,
        #     'weights_file_prefix': EXP_DIR + 'policy',
        # }
        self.policy_opt = self._hyperparams['policy_opt']['type'](
            self._hyperparams['policy_opt'], self.dO, self.dU)
Example No. 3
    def __init__(self, hyperparams):
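        """Set up the BADMM variant: merge hyperparams over the ALG_BADMM
        defaults, let Algorithm.__init__ build the per-condition iteration
        data, attach a PolicyInfo and its policy prior to every condition,
        and construct the policy optimizer."""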
        config = copy.deepcopy(ALG_BADMM)
        config.update(hyperparams)
        Algorithm.__init__(self, config)

        policy_prior = self._hyperparams['policy_prior']
        for m in range(self.M):
            self.cur[m].pol_info = PolicyInfo(self._hyperparams)
            self.cur[m].pol_info.policy_prior = \
                    policy_prior['type'](policy_prior)

        self.policy_opt = self._hyperparams['policy_opt']['type'](
            self._hyperparams['policy_opt'], self.dO, self.dU)
Example No. 4
    def __init__(self, hyperparams):
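        """Set up the OLGPS variant: merge hyperparams over the ALG_OLGPS
        defaults, construct the policy optimizer, initialize flag_reset to
        False, and attach a PolicyInfo (stored as last_pol) with its policy
        prior to every condition."""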
        config = copy.deepcopy(ALG_OLGPS)
        config.update(hyperparams)
        Algorithm.__init__(self, config)
        self.policy_opt = self._hyperparams['policy_opt']['type'](
            self._hyperparams['policy_opt'], self.dO, self.dU
        )
        self.flag_reset = False

        policy_prior = self._hyperparams['policy_prior']
        for m in range(self.M):
            self.cur[m].last_pol = PolicyInfo(self._hyperparams)
            self.cur[m].last_pol.policy_prior = \
                    policy_prior['type'](policy_prior)
Example No. 5
    def __init__(self, hyperparams):
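        """Set up the algorithm from the experiment hyperparameters: resolve
        train/test conditions, copy T, dU, dX and dO from the agent, and build
        the per-condition iteration data (dynamics, initial trajectory
        distribution, costs, policy info) plus the trajectory and policy
        optimizers."""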
        config = copy.deepcopy(ALG)
        config.update(hyperparams)
        self._hyperparams = config

        if 'train_conditions' in hyperparams:
            self._cond_idx = hyperparams['train_conditions']
            self.M = len(self._cond_idx)
        else:
            self.M = hyperparams['conditions']
            self._cond_idx = range(self.M)
            self._hyperparams['train_conditions'] = self._cond_idx
            self._hyperparams['test_conditions'] = self._cond_idx
        self.iteration_count = 0

        # Grab a few values from the agent.
        agent = self._hyperparams['agent']
        self.agent = agent

        self.T = self._hyperparams['T'] = agent.T
        self.dU = self._hyperparams['dU'] = agent.dU
        self.dX = self._hyperparams['dX'] = agent.dX
        self.dO = self._hyperparams['dO'] = agent.dO

        init_traj_distr = config['init_traj_distr']
        init_traj_distr['x0'] = agent.x0
        init_traj_distr['dX'] = agent.dX
        init_traj_distr['dU'] = agent.dU
        del self._hyperparams['agent']  # Don't want to pickle this.

        # IterationData objects for each condition.
        self.cur = [IterationData() for _ in range(self.M)]
        self.prev = [IterationData() for _ in range(self.M)]

        dynamics = self._hyperparams['dynamics']
        for m in range(self.M):
            self.cur[m].traj_info = TrajectoryInfo()
            self.cur[m].traj_info.dynamics = dynamics['type'](dynamics)
            cur_init_traj_distr = extract_condition(init_traj_distr,
                                                    self._cond_idx[m])
            cur_init_traj_distr['cur_cond_idx'] = self._cond_idx[m]
            self.cur[m].traj_distr = cur_init_traj_distr['type'](
                cur_init_traj_distr, agent)

        self.traj_opt = hyperparams['traj_opt']['type'](
            hyperparams['traj_opt'])
        self.cost = []
        for m in range(self.M):
            cost_hyperparams = hyperparams['cost'].copy()
            cost_hyperparams['cur_cond_idx'] = self._cond_idx[m]
            self.cost.append(hyperparams['cost']['type'](cost_hyperparams))

        self.base_kl_step = self._hyperparams['kl_step']

        policy_prior = self._hyperparams['policy_prior']
        for m in range(self.M):
            self.cur[m].pol_info = PolicyInfo(self._hyperparams)
            self.cur[m].pol_info.policy_prior = \
                    policy_prior['type'](policy_prior)

        self.policy_opt = self._hyperparams['policy_opt']['type'](
            self._hyperparams['policy_opt'], self.dO, self.dU)
Example No. 6
    def re_init_pol_info(self, hyperparams):
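        """Re-create the per-condition PolicyInfo objects and their policy
        priors from the stored hyperparameters."""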
        policy_prior = self._hyperparams['policy_prior']
        for m in range(self.M):
            self.cur[m].pol_info = PolicyInfo(self._hyperparams)
            self.cur[m].pol_info.policy_prior = \
                    policy_prior['type'](policy_prior)