def _update_policy_fit(self, m, init=False):
    """
    Re-estimate the local policy values in the neighborhood of the
    trajectory.
    Args:
        m: Condition.
        init: Whether this is the initial fitting of the policy.
    """
    dX, dU, T = self.dX, self.dU, self.T
    # Choose samples to use.
    samples = self.cur[m].sample_list
    N = len(samples)
    pol_info = self.cur[m].pol_info
    X = samples.get_X()
    obs = samples.get_obs().copy()
    pol_mu, pol_sig = self.policy_opt.prob(obs)[:2]
    pol_info.pol_mu, pol_info.pol_sig = pol_mu, pol_sig

    # Update policy prior.
    policy_prior = pol_info.policy_prior
    if init:
        samples = SampleList(self.cur[m].sample_list)
        mode = self._hyperparams['policy_sample_mode']
    else:
        samples = SampleList([])
        mode = 'add'  # Don't replace with empty samples.
    policy_prior.update(samples, self.policy_opt, mode)

    # Fit linearization and store in pol_info.
    pol_info.pol_K, pol_info.pol_k, pol_info.pol_S = \
            policy_prior.fit(X, pol_mu, pol_sig)
    for t in range(T):
        pol_info.chol_pol_S[t, :, :] = \
                sp.linalg.cholesky(pol_info.pol_S[t, :, :])
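# A minimal sketch of how the linearization fitted above can be used. The
# stored (pol_K, pol_k, pol_S) define a time-varying linear-Gaussian policy
# u_t ~ N(K_t x_t + k_t, S_t); the function name and argument shapes below
# are illustrative assumptions, not part of the class above.
import numpy as np

def sample_linearized_policy(pol_K, pol_k, chol_pol_S, x, t):
    """Draw one action from the local policy fit at time step t."""
    mean = pol_K[t].dot(x) + pol_k[t]
    # chol_pol_S[t] is the upper-triangular Cholesky factor U_t of S_t
    # (sp.linalg.cholesky above), so U_t.T.dot(z) has covariance S_t.
    return mean + chol_pol_S[t].T.dot(np.random.randn(pol_k.shape[1]))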
def get_samples(self, condition, start=0, end=None):
    """
    Return the requested samples based on the start and end indices.
    Args:
        condition: Condition to return samples from.
        start: Starting index of samples to return.
        end: End index of samples to return, or None for all remaining samples.
    """
    return (SampleList(self._samples[condition][start:]) if end is None
            else SampleList(self._samples[condition][start:end]))
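# Usage sketch: the indices are ordinary Python slice bounds, so a negative
# start fetches the most recent samples. `agent` below stands for whatever
# object holds the samples (cf. the training loop further down, which calls
# self.agent.get_samples(cond, -num_samples)).
recent_five = agent.get_samples(condition=0, start=-5)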
def _update_policy_fit(self, m, init=False):
    """
    Re-estimate the local policy values in the neighborhood of the
    trajectory.
    Args:
        m: Condition.
        init: Whether this is the initial fitting of the policy.
    """
    dX, dU, T = self.dX, self.dU, self.T
    # Choose samples to use.
    samples = self.cur[m].sample_list
    N = len(samples)
    pol_info = self.cur[m].pol_info
    X = samples.get_X()
    pol_mu, pol_sig = self.policy_opt.prob(samples.get_obs().copy())[:2]
    pol_info.pol_mu, pol_info.pol_sig = pol_mu, pol_sig

    # Update policy prior.
    if init:
        self.cur[m].pol_info.policy_prior.update(
            samples, self.policy_opt,
            SampleList(self.cur[m].pol_info.policy_samples))
    else:
        self.cur[m].pol_info.policy_prior.update(
            SampleList([]), self.policy_opt,
            SampleList(self.cur[m].pol_info.policy_samples))

    # Collapse policy covariances. This is not really correct, but it works
    # fine so long as the policy covariance doesn't depend on the state.
    pol_sig = np.mean(pol_sig, axis=0)

    # Estimate the policy linearization at each time step.
    for t in range(T):
        # Assemble diagonal weights matrix and data.
        dwts = (1.0 / N) * np.ones(N)
        Ts = X[:, t, :]
        Ps = pol_mu[:, t, :]
        Ys = np.concatenate((Ts, Ps), axis=1)
        # Obtain Normal-inverse-Wishart prior.
        mu0, Phi, mm, n0 = self.cur[m].pol_info.policy_prior.eval(Ts, Ps)
        sig_reg = np.zeros((dX+dU, dX+dU))
        # On the first time step, always slightly regularize covariance.
        if t == 0:
            sig_reg[:dX, :dX] = 1e-8 * np.eye(dX)
        # Perform computation.
        pol_K, pol_k, pol_S = gauss_fit_joint_prior(Ys, mu0, Phi, mm, n0,
                                                    dwts, dX, dU, sig_reg)
        pol_S += pol_sig[t, :, :]
        pol_info.pol_K[t, :, :], pol_info.pol_k[t, :] = pol_K, pol_k
        pol_info.pol_S[t, :, :], pol_info.chol_pol_S[t, :, :] = \
                pol_S, sp.linalg.cholesky(pol_S)
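# What the per-time-step fit above computes at its core: fit a joint
# Gaussian over [x_t; u_t] and condition on x_t, which yields the
# linear-Gaussian policy u_t ~ N(K_t x_t + k_t, S_t). A standalone numpy
# sketch of that conditioning step, ignoring the NIW prior terms (mu0, Phi,
# mm, n0), which only regularize the empirical mean and covariance:
import numpy as np

def condition_joint_gaussian(Ys, dX, dU):
    """Ys: N x (dX+dU) stacked [x, u] samples at one time step."""
    mu = np.mean(Ys, axis=0)
    sigma = np.cov(Ys, rowvar=False)
    K = sigma[dX:, :dX].dot(np.linalg.inv(sigma[:dX, :dX]))  # pol_K analogue
    k = mu[dX:] - K.dot(mu[:dX])                             # pol_k analogue
    S = sigma[dX:, dX:] - K.dot(sigma[:dX, :dX]).dot(K.T)    # pol_S analogue
    return K, k, S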
def _update_policy_fit(self, m):
    """
    Re-estimate the local policy values in the neighborhood of the
    trajectory.
    Args:
        m: Condition.
    """
    dX, dU, T = self.dX, self.dU, self.T
    # Choose samples to use.
    samples = self.cur[m].sample_list
    N = len(samples)
    pol_info = self.cur[m].pol_info
    X = samples.get_X()
    obs = samples.get_obs().copy()
    pol_mu, pol_sig = self.policy_opt.prob(obs)[:2]
    pol_info.pol_mu, pol_info.pol_sig = pol_mu, pol_sig

    # Update policy prior.
    policy_prior = pol_info.policy_prior
    samples = SampleList(self.cur[m].sample_list)
    mode = self._hyperparams['policy_sample_mode']
    print("mode: ", mode)
    policy_prior.update(samples, self.policy_opt, mode)

    # Fit linearization and store in pol_info.
    pol_info.pol_K, pol_info.pol_k, pol_info.pol_S = \
            policy_prior.fit(X, pol_mu, pol_sig)
    for t in range(T):
        pol_info.chol_pol_S[t, :, :] = \
                sp.linalg.cholesky(pol_info.pol_S[t, :, :])
def _test_policy_samples(self, N=None):
    """
    Take test samples from the policy and collect their costs.
    Args:
        N: Number of policy samples to take per condition (currently
            unused; one sample is taken per condition).
    Returns:
        samples: List of SampleList objects, one per test condition.
        costs: List of total cost for each condition.
        ee_points: List of end-effector points for each condition.
    """
    if 'verbose_policy_trials' not in self._hyperparams:
        return None
    verbose = self._hyperparams['verbose_policy_trials']
    pol_samples = [[None] for _ in range(len(self._test_idx))]
    costs = list()
    ee_points = list()
    for cond in range(len(self._test_idx)):
        pol_samples[cond][0] = self.agent.sample(
            self.algorithm.policy_opt.policy, self._test_idx[cond],
            verbose=verbose, save=False, noisy=False)
        # Cost evaluation as in algorithm.py: _eval_cost.
        policy_cost = self.algorithm.cost[0].eval(pol_samples[cond][0])[0]
        policy_cost = np.sum(policy_cost)  # Sum per-step costs over the horizon (e.g. 100 steps).
        costs.append(policy_cost)
        ee_points.append(self.agent.get_ee_point(cond))
    return [SampleList(samples) for samples in pol_samples], costs, ee_points
def take_policy_samples(agent, policy, conditions, n):
    """Take n noiseless policy samples for each of the given conditions."""
    return [
        SampleList([
            agent.sample(policy, cond, save=False, noisy=False)
            for _ in range(n)
        ]) for cond in range(conditions)
    ]
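# Usage sketch (hypothetical agent/algorithm objects): take 5 noiseless
# rollouts of the trained global policy on each of 4 conditions.
pol_sample_lists = take_policy_samples(
    agent, algorithm.policy_opt.policy, conditions=4, n=5)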
def _take_policy_samples(self, N=None):
    """
    Take samples from the policy to see how it's doing.
    Args:
        N: Number of policy samples to take per condition.
    Returns:
        A list of SampleList objects, one per test condition.
    """
    print(" ================================ test policy ====================================")
    if 'verbose_policy_trials' not in self._hyperparams:  # AlgorithmTrajOpt
        return None
    verbose = self._hyperparams['verbose_policy_trials']
    if self.gui:
        self.gui.set_status_text('Taking policy samples.')
    pol_samples = [[None] for _ in range(len(self._test_idx))]
    # Since this isn't noisy, just take one sample.
    # TODO: Make this noisy? Add hyperparam?
    # TODO: Take at all conditions for GUI?
    for cond in range(len(self._test_idx)):
        pol_samples[cond][0] = self.agent.sample(
            self.algorithm.policy_opt.policy, self._test_idx[cond],
            verbose=verbose, save=False, noisy=False)
    return [SampleList(samples) for samples in pol_samples]
def _take_policy_samples(self, N=None):
    """
    Take samples from the policy to see how it's doing.
    Args:
        N: Number of policy samples to take per condition.
    Returns:
        A list of SampleList objects, one per test condition.
    """
    if 'verbose_policy_trials' not in self._hyperparams:  # AlgorithmTrajOpt
        return None
    # Whether to run verbose trials.
    verbose = self._hyperparams['verbose_policy_trials']
    if self.gui:
        self.gui.set_status_text('Taking policy samples.')
    pol_samples = [[None] for _ in range(len(self._test_idx))]
    # Since this isn't noisy, just take one sample.
    # TODO: Make this noisy? Add hyperparam?
    # TODO: Take at all conditions for GUI?
    # Iterate through problem instantiations, accumulating samples from the
    # global policy.
    for cond in range(len(self._test_idx)):
        pol_samples[cond][0] = self.agent.sample(
            self.algorithm.policy_opt.policy, self._test_idx[cond],
            verbose=verbose, save=False, noisy=False)
    # Return the samples, held in SampleList objects.
    return [SampleList(samples) for samples in pol_samples]
def _take_policy_samples(self, N, pol, rnd=False, randomize_initial_state=0):
    """Takes samples from the policy without exploration noise.
    Args:
        N: Number of policy samples to take per condition.
        pol: Policy to sample. Specify `None` to sample the local LQR
            policies instead.
        rnd: Use random reset states.
        randomize_initial_state: Amount of initial state randomization.
    """
    if pol is None:
        # One independent row per condition ([[None] * N] * M would make
        # every row alias the same list).
        pol_samples = [[None] * N for _ in self._test_idx]
        for i, cond in enumerate(self._test_idx):
            for n in trange(N,
                            desc='Taking LQR-policy samples m=%d, cond=%s' %
                            (cond, 'rnd' if rnd else cond)):
                pol_samples[i][n] = self.agent.sample(
                    self.algorithm.cur[cond].traj_distr,
                    None,
                    save=False,
                    noisy=False,
                    reset_cond=None if rnd else cond,
                    randomize_initial_state=randomize_initial_state,
                    record=False)
        return [SampleList(samples) for samples in pol_samples]
    else:
        conds = self._test_idx if not rnd else [None]
        # Stores where the policy has led to.
        pol_samples = [[None] * N for _ in conds]
        for i, cond in enumerate(conds):
            for n in trange(N,
                            desc='Taking %s policy samples cond=%s' %
                            (type(pol).__name__, 'rnd' if rnd else cond)):
                pol_samples[i][n] = self.agent.sample(
                    pol,
                    None,
                    save=False,
                    noisy=False,
                    reset_cond=cond,
                    randomize_initial_state=randomize_initial_state,
                    record=n < 0)
        return [SampleList(samples) for samples in pol_samples]
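# Usage sketch, mirroring how the training loop below calls this method:
# evaluate the global policy with N rollouts from random reset states.
samples = self._take_policy_samples(
    N=self._hyperparams['num_pol_samples_random'],
    pol=self.algorithm.policy_opt.policy, rnd=True)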
def __init__(self, hyperparams):
    config = copy.deepcopy(ALG)
    config.update(hyperparams)
    self._hyperparams = config
    if 'train_conditions' in hyperparams:
        self._cond_idx = hyperparams['train_conditions']
        self.M = len(self._cond_idx)
    else:
        self.M = hyperparams['conditions']
        self._cond_idx = range(self.M)
    self.iteration_count = 0

    # Grab a few values from the agent.
    agent = self._hyperparams['agent']
    self.T = self._hyperparams['T'] = agent.T
    self.dU = self._hyperparams['dU'] = agent.dU
    self.dX = self._hyperparams['dX'] = agent.dX
    self.dO = self._hyperparams['dO'] = agent.dO

    init_traj_distr = config['init_traj_distr']
    init_traj_distr['x0'] = agent.x0
    init_traj_distr['dX'] = agent.dX
    init_traj_distr['dU'] = agent.dU
    del self._hyperparams['agent']  # Don't want to pickle this.

    # IterationData objects for each condition.
    self.cur = [IterationData() for _ in range(self.M)]
    self.prev = [IterationData() for _ in range(self.M)]
    self.traj_distr = {self.iteration_count: []}
    self.traj_info = {self.iteration_count: []}
    self.kl_div = {self.iteration_count: []}
    self.dists_to_target = {self.iteration_count: []}
    self.sample_list = {i: SampleList([]) for i in range(self.M)}
    for m in range(self.M):
        self.cur[m].traj_info = TrajectoryInfo()
        dynamics = self._hyperparams['dynamics']
        self.cur[m].traj_info.dynamics = dynamics['type'](dynamics)
        init_traj_distr = extract_condition(
            self._hyperparams['init_traj_distr'], self._cond_idx[m])
        self.cur[m].traj_distr = init_traj_distr['type'](init_traj_distr)
        self.traj_distr[self.iteration_count].append(self.cur[m].traj_distr)
        self.traj_info[self.iteration_count].append(self.cur[m].traj_info)
    self.traj_opt = hyperparams['traj_opt']['type'](hyperparams['traj_opt'])
    self.cost = [
        hyperparams['cost']['type'](hyperparams['cost'])
        for _ in range(self.M)
    ]
    if self._hyperparams['ioc']:
        self.gt_cost = [
            hyperparams['gt_cost']['type'](hyperparams['gt_cost'])
            for _ in range(self.M)
        ]
    self.base_kl_step = self._hyperparams['kl_step']
def _assign_samples(self, samples, responsibilities):
    """ Assigns samples to clusters by their responsibilities. """
    for m in range(self.M):
        self.cur[m].sample_list = SampleList([
            samples[i] for i in range(self.N) if responsibilities[i] == m
        ])
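# Sketch of one way the `responsibilities` argument above could be produced
# (hypothetical; the clustering model itself is not shown here): a hard
# assignment of each of the N samples to one of M clusters via the argmax
# over per-cluster posterior weights.
import numpy as np

def hard_assignments(posterior_weights):
    """posterior_weights: N x M array of per-cluster posteriors."""
    return np.argmax(posterior_weights, axis=1)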
def _take_policy_samples(self, cond_list):
    pol_samples = [[] for _ in range(len(cond_list))]
    for cond in range(len(cond_list)):
        for i in range(self._hyperparams['num_samples']):
            pol_samples[cond].append(
                self.agent.sample(self.algorithm.policy_opt.policy,
                                  cond_list[cond], save=False))
    return [SampleList(samples) for samples in pol_samples]
def iteration(itr, cfg, agent, algorithm):
    conditions = len(cfg['common']['train_conditions'])
    samples_per_cond = cfg['num_samples']
    sample_lists = []
    for cond in range(conditions):
        policy = algorithm.cur[cond].traj_distr
        sample_list = SampleList([
            agent.sample(policy, cond, save=False, noisy=True)
            for _ in range(samples_per_cond)
        ])
        sample_lists.append(sample_list)
    algorithm.iteration(sample_lists)
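# Usage sketch (hypothetical cfg/agent/algorithm objects and 'iterations'
# key): run the sampling + optimization step above for a fixed number of
# outer iterations.
for itr in range(cfg['iterations']):
    iteration(itr, cfg, agent, algorithm)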
def _take_policy_samples(self, N=None, test_policy=False):
    """
    Take samples from the policy to see how it's doing.
    Args:
        N: Number of policy samples to take per condition.
        test_policy: Whether to use the policy-test horizon.
    Returns:
        A list of SampleList objects, one per test condition.
    """
    if not N:
        N = self._hyperparams['verbose_policy_trials']
    verbose = self._hyperparams['verbose_policy_trials']
    if self.gui:
        self.gui.set_status_text('Taking policy samples.')
    pol_samples = [[None] for _ in range(len(self._test_idx))]
    # Since this isn't noisy, just take one sample.
    for cond in range(len(self._test_idx)):
        extra_args = {}
        if type(self.agent) == AgentSUPERball:
            agent_hyper = self._hyperparams['agent']
            extra_args = {
                'superball_parameters': {
                    'reset': ('reset' not in agent_hyper
                              or agent_hyper['reset'][cond]),
                    'relax': ('relax' in agent_hyper
                              and agent_hyper['relax'][cond]),
                    'bottom_face': (None if 'bottom_faces' not in agent_hyper
                                    else agent_hyper['bottom_faces'][cond]),
                    'horizon': (None if (not test_policy)
                                or 'policy_test_horizon' not in agent_hyper
                                else agent_hyper['policy_test_horizon']),
                    'start_motor_positions':
                        (None if 'start_motor_positions' not in agent_hyper
                         else agent_hyper['start_motor_positions'][cond]),
                    'motor_position_control_gain':
                        (None if 'motor_position_control_gain' not in agent_hyper
                         else agent_hyper['motor_position_control_gain'][cond]),
                    'debug': False,
                }
            }
        pol_samples[cond][0] = self.agent.sample(
            self.algorithm.policy_opt.policy, self._test_idx[cond],
            verbose=verbose, save=False, noisy=False, **extra_args)
    return [SampleList(samples) for samples in pol_samples]
def _take_policy_samples(self, N, pol, rnd=False):
    """
    Take samples from the policy to see how it's doing.
    Args:
        N: Number of policy samples to take per condition.
        pol: Policy to sample. None for LQR policies.
    Returns:
        A list of SampleList objects, one per condition.
    """
    if pol is None:
        # One independent row per condition ([[None] * N] * M would make
        # every row alias the same list).
        pol_samples = [[None] * N for _ in self._test_idx]
        for i, cond in enumerate(self._test_idx):
            for n in trange(N,
                            desc='Taking LQR-policy samples m=%d, cond=%s' %
                            (cond, 'rnd' if rnd else cond)):
                pol_samples[i][n] = self.agent.sample(
                    self.algorithm.cur[cond].traj_distr,
                    None,
                    verbose=None,
                    save=False,
                    noisy=False,
                    reset_cond=None if rnd else cond,
                    record=False)
        return [SampleList(samples) for samples in pol_samples]
    else:
        conds = self._test_idx if not rnd else [None]
        # Stores where the policy has led to.
        pol_samples = [[None] * N for _ in conds]
        for i, cond in enumerate(conds):
            for n in trange(N,
                            desc='Taking %s policy samples cond=%s' %
                            (type(pol).__name__, 'rnd' if rnd else cond)):
                pol_samples[i][n] = self.agent.sample(pol,
                                                      None,
                                                      verbose=None,
                                                      save=False,
                                                      noisy=False,
                                                      reset_cond=cond,
                                                      record=n < 0)
        return [SampleList(samples) for samples in pol_samples]
def _take_policy_samples(self, cond_list, guided_steps=0, t_length=50):
    pol_samples = [[] for _ in range(len(cond_list))]
    for cond in range(len(cond_list)):
        for i in range(self._hyperparams['num_samples']):
            pol_samples[cond].append(
                self.agent.sample(
                    self.algorithm.policy_opt.policy,
                    cond_list[cond],
                    start_policy=self.algorithm.cur[cond].traj_distr,
                    save=False,
                    ltorun=True,
                    guided_steps=guided_steps,
                    t_length=t_length))
    return [SampleList(samples) for samples in pol_samples]
def take_nn_samples(self, N=None):
    """
    Take samples from the NN policy to see how it's doing.
    Args:
        N: Number of policy samples to take per condition.
    Returns:
        samples: List of SampleList objects, one per test condition.
        costs: List of total cost for each condition.
        ee_points: List of end-effector points for each condition.
    """
    if 'verbose_policy_trials' not in self._hyperparams:  # AlgorithmTrajOpt
        return None
    verbose = self._hyperparams['verbose_policy_trials']
    if self.gui:
        self.gui.set_status_text('Taking policy samples.')
    pol_samples = [[None] for _ in range(len(self._test_idx))]
    # Since this isn't noisy, just take one sample.
    # TODO: Make this noisy? Add hyperparam?
    # TODO: Take at all conditions for GUI?
    costs = list()
    ee_points = list()
    for cond in range(len(self._test_idx)):
        pol_samples[cond][0] = self.agent.sample(
            self.algorithm.policy_opt.policy, self._test_idx[cond],
            verbose=verbose, save=False, noisy=False)
        policy_cost = self.algorithm.cost[0].eval(pol_samples[cond][0])[0]
        policy_cost = np.sum(policy_cost)
        # Wait to plot in the GUI in gps_training_gui.py.
        print("cost: %d" % policy_cost)
        costs.append(policy_cost)
        ee_points.append(self.agent.get_ee_point(cond))
    return [SampleList(samples) for samples in pol_samples], costs, ee_points
def iteration(self, sample_lists):
    """
    Run iteration of LQR.
    Args:
        sample_lists: List of SampleList objects for each condition.
    """
    self.N = sum(len(self.sample_list[i]) for i in self.sample_list.keys())
    for m in range(self.M):
        self.cur[m].sample_list = sample_lists[m]
        prev_samples = self.sample_list[m].get_samples()
        prev_samples.extend(sample_lists[m].get_samples())
        self.sample_list[m] = SampleList(prev_samples)
        self.N += len(sample_lists[m])

    # Update dynamics model using all samples.
    self._update_dynamics()

    # Update the cost during learning if we use IOC.
    if self._hyperparams['ioc']:
        self._update_cost()

    self._update_step_size()  # KL divergence step size.

    # Run inner loop to compute new policies.
    for _ in range(self._hyperparams['inner_iterations']):
        self._update_trajectories()

    # Compute the KL divergence between the sample distribution and the
    # demo distribution.
    itr = self.iteration_count
    if self._hyperparams['ioc']:
        for i in range(self.M):
            mu, sigma = self.traj_opt.forward(self.traj_distr[itr][i],
                                              self.traj_info[itr][i])
            # KL divergence between the current trajectory distribution and
            # the ground-truth distribution.
            self.kl_div[itr].append(
                traj_distr_kl(mu, sigma, self.traj_distr[itr][i],
                              self.demo_traj[i]))
    if self._hyperparams['learning_from_prior']:
        for i in range(self.M):
            target_position = self._hyperparams['target_end_effector'][:3]
            cur_samples = sample_lists[i].get_samples()
            sample_end_effectors = [
                s.get(END_EFFECTOR_POINTS) for s in cur_samples
            ]
            dists = [
                np.amin(np.sqrt(np.sum(
                    (ee[:, :3] - target_position.reshape(1, -1))**2,
                    axis=1)), axis=0)
                for ee in sample_end_effectors
            ]
            self.dists_to_target[itr].append(sum(dists) / len(cur_samples))

    self._advance_iteration_variables()
def _take_policy_samples(self, N=None):
    """
    Take samples from the policy to see how it's doing.
    Args:
        N: Number of policy samples to take per condition.
    Returns:
        A list of SampleList objects, one per test condition.
    """
    if 'verbose_policy_trials' not in self._hyperparams:
        return None
    if not N:
        N = self._hyperparams['verbose_policy_trials']
    if self.gui:
        self.gui.set_status_text('Taking policy samples.')
    pol_samples = [[None for _ in range(N)]
                   for _ in range(self._conditions)]
    for cond in range(len(self._test_idx)):
        for i in range(N):
            pol_samples[cond][i] = self.agent.sample(
                self.algorithm.policy_opt.policy, self._test_idx[cond],
                verbose=True, save=False)
    return [SampleList(samples) for samples in pol_samples]
def _update_policy_fit(self, m):
    """
    Re-estimate the local policy values in the neighborhood of the
    trajectory.
    Args:
        m: Condition.
    """
    dX, dU, T = self.dX, self.dU, self.T
    # Choose samples to use.
    samples = self.cur[m].sample_list
    N = len(samples)
    pol_info = self.cur[m].pol_info
    X = samples.get_X()
    obs = samples.get_obs().copy()
    pol_mu, pol_sig = self.policy_opt.prob(obs)[:2]
    pol_info.pol_mu, pol_info.pol_sig = pol_mu, pol_sig

    # Update policy prior. policy_prior is e.g. a
    # gps.algorithm.policy.policy_prior_gmm.PolicyPriorGMM instance, which
    # provides the update and fit methods used below.
    policy_prior = pol_info.policy_prior
    samples = SampleList(self.cur[m].sample_list)
    mode = self._hyperparams['policy_sample_mode']
    policy_prior.update(samples, self.policy_opt, mode)

    # Fit linearization and store in pol_info.
    pol_info.pol_K, pol_info.pol_k, pol_info.pol_S = \
            policy_prior.fit(X, pol_mu, pol_sig)
    for t in range(T):
        pol_info.chol_pol_S[t, :, :] = \
                sp.linalg.cholesky(pol_info.pol_S[t, :, :])
def get_reset_samples(self, condition, start=0, end=None):
    """
    Return the requested reset samples based on the start and end indices.
    Args:
        condition: Condition to return samples from.
        start: Starting index of samples to return.
        end: End index of samples to return, or None for all remaining samples.
    """
    return (SampleList(self._samples[condition][start:]) if end is None
            else SampleList(self._samples[condition][start:end]))
def generate(self):
    """
    Generate demos and save them in a file for the experiment.
    Returns: None.
    """
    # Load the algorithm.
    import pickle

    # This should give us the optimal controller. Maybe setting it to
    # 'controller_itr_%02d.pkl' % itr_load would be better?
    algorithm_file = self._algorithm_files_dir
    self.algorithm = pickle.load(open(algorithm_file, 'rb'))
    if self.algorithm is None:
        print("Error: cannot find '%s.'" % algorithm_file)
        os._exit(1)  # Called instead of sys.exit(), since this is in a thread.

    # Keep the initial states of the agent the same as the demonstrations.
    # _learning: whether the experiment is learning from prior experience.
    self._learning = self.ioc_algo._hyperparams['learning_from_prior']
    agent_config = self._hyperparams['demo_agent']
    if agent_config['filename'] == './mjc_models/pr2_arm3d.xml' and not self._learning:
        agent_config['x0'] = self.algorithm._hyperparams['agent_x0']
        agent_config['pos_body_idx'] = self.algorithm._hyperparams['agent_pos_body_idx']
        agent_config['pos_body_offset'] = self.algorithm._hyperparams['agent_pos_body_offset']
    self.agent = agent_config['type'](agent_config)

    # Roll out the demonstrations from controllers.
    var_mult = self.algorithm._hyperparams['var_mult']
    T = self.algorithm.T
    demos = []
    M = agent_config['conditions']
    N = self.ioc_algo._hyperparams['num_demos']
    if not self._learning:
        controllers = {}
        good_conds = self.ioc_algo._hyperparams['demo_cond']
        # Store each controller under M conditions into controllers.
        for i in range(M):
            controllers[i] = self.algorithm.cur[i].traj_distr
        controllers_var = copy.copy(controllers)
        for i in range(M):
            # Increase controller variance.
            controllers_var[i].chol_pol_covar *= var_mult
            # Gather demos.
            for j in range(N):
                demo = self.agent.sample(
                    controllers_var[i], i,
                    verbose=(i < self.algorithm._hyperparams['demo_verbose']),
                    save=True)
                demos.append(demo)
    else:
        # Extract the neural network policy.
        pol = self.algorithm.policy_opt.policy
        for i in range(M):
            # Gather demos.
            demo = self.agent.sample(
                pol, i, verbose=(i < self._hyperparams['verbose_trials']))
            demos.append(demo)

    # Filter out the worst (M - good_conds) demos.
    target_position = agent_config['target_end_effector'][:3]
    dists_to_target = np.zeros(M)
    for i in range(M):
        demo_end_effector = demos[i].get(END_EFFECTOR_POINTS)
        dists_to_target[i] = np.amin(np.sqrt(np.sum(
            (demo_end_effector[:, :3] - target_position.reshape(1, -1))**2,
            axis=1)), axis=0)
    if not self._learning:
        good_indices = dists_to_target.argsort()[:good_conds - M].tolist()
    else:
        good_indicators = (dists_to_target <=
                           agent_config['success_upper_bound']).tolist()
        good_indices = [i for i in range(len(good_indicators))
                        if good_indicators[i]]
    bad_indices = np.argmax(dists_to_target)
    self.ioc_algo._hyperparams['demo_cond'] = len(good_indices)
    filtered_demos = []
    self.ioc_algo.demo_conditions = []
    self.ioc_algo.failed_conditions = []
    exp_dir = self._data_files_dir.replace("data_files", "")
    with open(exp_dir + 'log.txt', 'a') as f:
        f.write('\nThe demo conditions are: \n')
    for i in good_indices:
        filtered_demos.append(demos[i])
        self.ioc_algo.demo_conditions.append(agent_config['pos_body_offset'][i])
        with open(exp_dir + 'log.txt', 'a') as f:
            f.write('\n' + str(agent_config['pos_body_offset'][i]) + '\n')
    with open(exp_dir + 'log.txt', 'a') as f:
        f.write('\nThe failed badmm conditions are: \n')
    for i in range(M):
        if i not in good_indices:
            self.ioc_algo.failed_conditions.append(agent_config['pos_body_offset'][i])
            with open(exp_dir + 'log.txt', 'a') as f:
                f.write('\n' + str(agent_config['pos_body_offset'][i]) + '\n')
    shuffle(filtered_demos)
    demo_list = SampleList(filtered_demos)
    demo_store = {'demoX': demo_list.get_X(),
                  'demoU': demo_list.get_U(),
                  'demoO': demo_list.get_obs()}
    if self._learning:
        demo_store['pos_body_offset'] = [agent_config['pos_body_offset'][bad_indices]]
    # Save the demos.
    self.data_logger.pickle(
        self._data_files_dir + 'demos.pkl',
        copy.copy(demo_store))
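# The filtering criterion above in isolation: a demo's score is the
# minimum, over the trajectory, of the Euclidean distance from the first
# end-effector point to the target. A standalone numpy sketch:
import numpy as np

def min_dist_to_target(ee_traj, target_position):
    """ee_traj: T x 3 end-effector positions; target_position: (3,)."""
    dists = np.sqrt(np.sum((ee_traj - target_position.reshape(1, -1))**2,
                           axis=1))
    return dists.min()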
def _eval_cost(self, cond, prev_cost=False):
    """
    Evaluate costs for all samples for a condition.
    Args:
        cond: Condition to evaluate cost on.
        prev_cost: Whether or not to use the previous cost (for IOC step
            adjustment).
    """
    # Constants.
    T, dX, dU = self.T, self.dX, self.dU
    synN = self._hyperparams['synthetic_cost_samples']
    if synN > 0:
        agent = self.cur[cond].sample_list.get_samples()[0].agent
        X, U, _ = self._traj_samples(cond, synN)
        syn_samples = []
        for i in range(synN):
            sample = Sample(agent)
            sample.set_XU(X[i, :, :], U[i, :, :])
            syn_samples.append(sample)
        all_samples = SampleList(syn_samples +
                                 self.cur[cond].sample_list.get_samples())
    else:
        all_samples = self.cur[cond].sample_list
    N = len(all_samples)

    # Compute cost.
    cs = np.zeros((N, T))
    cc = np.zeros((N, T))
    cv = np.zeros((N, T, dX + dU))
    Cm = np.zeros((N, T, dX + dU, dX + dU))
    if self._hyperparams['ioc']:
        cgt = np.zeros((N, T))
    for n in range(N):
        sample = all_samples[n]
        # Get costs.
        if prev_cost:
            l, lx, lu, lxx, luu, lux = self.previous_cost[cond].eval(sample)
        else:
            l, lx, lu, lxx, luu, lux = self.cost[cond].eval(sample)
        # Compute the ground truth cost.
        if self._hyperparams['ioc'] and n >= synN:
            l_gt, _, _, _, _, _ = self.gt_cost[cond].eval(sample)
            cgt[n, :] = l_gt
        cc[n, :] = l
        cs[n, :] = l
        # Assemble matrix and vector.
        cv[n, :, :] = np.c_[lx, lu]
        Cm[n, :, :, :] = np.concatenate(
            (np.c_[lxx, np.transpose(lux, [0, 2, 1])], np.c_[lux, luu]),
            axis=1)
        # Adjust for expanding cost around a sample.
        X = sample.get_X()
        U = sample.get_U()
        yhat = np.c_[X, U]
        rdiff = -yhat
        rdiff_expand = np.expand_dims(rdiff, axis=2)
        cv_update = np.sum(Cm[n, :, :, :] * rdiff_expand, axis=1)
        cc[n, :] += np.sum(rdiff * cv[n, :, :], axis=1) + 0.5 * \
                np.sum(rdiff * cv_update, axis=1)
        cv[n, :, :] += cv_update

    # Fill in cost estimate.
    if prev_cost:
        traj_info = self.cur[cond].prevcost_traj_info
        traj_info.dynamics = self.cur[cond].traj_info.dynamics
        traj_info.x0sigma = self.cur[cond].traj_info.x0sigma
        traj_info.x0mu = self.cur[cond].traj_info.x0mu
    else:
        traj_info = self.cur[cond].traj_info
    self.cur[cond].cs = cs[synN:]  # True value of cost.
    traj_info.cc = np.mean(cc, 0)  # Constant term (scalar).
    traj_info.cv = np.mean(cv, 0)  # Linear term (vector).
    traj_info.Cm = np.mean(Cm, 0)  # Quadratic term (matrix).
    if self._hyperparams['ioc']:
        self.cur[cond].cgt = cgt[synN:]
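# The "adjust for expanding cost around a sample" step above in isolation:
# a quadratic expansion l(y) ~= cc + cv.(y - yhat) + 0.5 (y - yhat)' Cm
# (y - yhat) taken around the sample point yhat = [x, u] is re-centered at
# y = 0, so that cc and cv become coefficients of y itself. A standalone
# numpy sketch for a single time step (Cm symmetric):
import numpy as np

def recenter_quadratic(cc, cv, Cm, yhat):
    """Shift a quadratic cost expansion from yhat to the origin."""
    rdiff = -yhat
    cv_update = Cm.dot(rdiff)
    cc = cc + rdiff.dot(cv) + 0.5 * rdiff.dot(cv_update)
    cv = cv + cv_update
    return cc, cv  # Cm is unchanged by the shift.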
def run(self):
    """Runs training by alternately taking samples and optimizing the policy."""
    if 'load_model' in self._hyperparams:
        self.iteration_count = self._hyperparams['load_model'][1]
        self.algorithm.policy_opt.iteration_count = self.iteration_count
        self.algorithm.policy_opt.restore_model(*self._hyperparams['load_model'])
        # Global policy, static resets.
        if self._hyperparams['num_pol_samples_static'] > 0:
            self.export_samples(
                self._take_policy_samples(
                    N=self._hyperparams['num_pol_samples_static'],
                    pol=self.algorithm.policy_opt.policy,
                    rnd=False),
                '_pol-static', visualize=True)
        return

    for itr in range(self._hyperparams['iterations']):
        self.iteration_count = itr
        if hasattr(self.algorithm, 'traj_opt'):
            self.algorithm.traj_opt.iteration_count = itr
        if hasattr(self.algorithm, 'policy_opt'):
            self.algorithm.policy_opt.iteration_count = itr
        print("*** Iteration %02d ***" % itr)

        if itr == 0 and 'load_initial_samples' in self._hyperparams:
            # Load trajectory samples.
            print('Loading initial samples ...')
            sample_files = self._hyperparams['load_initial_samples']
            traj_sample_lists = [[] for _ in range(self.algorithm.M)]
            for sample_file in sample_files:
                data = np.load(sample_file)
                X, U = data['X'], data['U']
                assert X.shape[0] == self.algorithm.M
                for m in range(self.algorithm.M):
                    for n in range(X.shape[1]):
                        traj_sample_lists[m].append(
                            self.agent.pack_sample(X[m, n], U[m, n]))
            traj_sample_lists = [
                SampleList(traj_samples)
                for traj_samples in traj_sample_lists
            ]
        else:
            # Take trajectory samples.
            with Timer(self.algorithm.timers, 'sampling'):
                for cond in self._train_idx:
                    for i in trange(self._hyperparams['num_samples'],
                                    desc='Taking samples'):
                        self._take_sample(cond, i)
            traj_sample_lists = [
                self.agent.get_samples(cond, -self._hyperparams['num_samples'])
                for cond in self._train_idx
            ]
        self.export_samples(traj_sample_lists, visualize=True)

        # Iteration.
        with Timer(self.algorithm.timers, 'iteration'):
            self.algorithm.iteration(traj_sample_lists, itr)
        self.export_dynamics()
        self.export_controllers()
        self.export_times()
        if hasattr(self.algorithm, 'policy_opt') and hasattr(
                self.algorithm.policy_opt, 'store_model'):
            self.algorithm.policy_opt.store_model()

        # Sample learned policies for visualization.
        # LQR policies, static resets.
        if self._hyperparams['num_lqr_samples_static'] > 0:
            self.export_samples(
                self._take_policy_samples(
                    N=self._hyperparams['num_lqr_samples_static'],
                    pol=None, rnd=False),
                '_lqr-static', visualize=True)
        # LQR policies, random resets.
        if self._hyperparams['num_lqr_samples_random'] > 0:
            self.export_samples(
                self._take_policy_samples(
                    N=self._hyperparams['num_lqr_samples_random'],
                    pol=None, rnd=True),
                '_lqr-random', visualize=True)
        # LQR policies, initial state noise.
        if self._hyperparams['num_lqr_samples_random'] > 0:
            self.export_samples(
                self._take_policy_samples(
                    N=self._hyperparams['num_lqr_samples_random'],
                    pol=None, rnd=False, randomize_initial_state=24),
                '_lqr-static-randomized', visualize=True)
        if hasattr(self.algorithm, 'policy_opt'):
            # Global policy, static resets.
            if self._hyperparams['num_pol_samples_static'] > 0:
                self.export_samples(
                    self._take_policy_samples(
                        N=self._hyperparams['num_pol_samples_static'],
                        pol=self.algorithm.policy_opt.policy,
                        rnd=False),
                    '_pol-static', visualize=True)
            # Global policy, random resets.
            if self._hyperparams['num_pol_samples_random'] > 0:
                self.export_samples(
                    self._take_policy_samples(
                        N=self._hyperparams['num_pol_samples_random'],
                        pol=self.algorithm.policy_opt.policy,
                        rnd=True),
                    '_pol-random', visualize=True)
            # Global policy, initial state noise.
            if self._hyperparams['num_pol_samples_random'] > 0:
                self.export_samples(
                    self._take_policy_samples(
                        N=self._hyperparams['num_pol_samples_random'],
                        pol=self.algorithm.policy_opt.policy,
                        rnd=False, randomize_initial_state=24),
                    '_pol-static-randomized', visualize=True)
        self.visualize_training_progress()