def _eval_cost(self, cond, prev_cost=False):
    """
    Evaluate costs for all samples for a condition.
    Args:
        cond: Condition to evaluate cost on.
        prev_cost: Whether or not to use previous_cost (for ioc stepadjust).
    """
    # Constants.
    T, dX, dU = self.T, self.dX, self.dU
    synN = self._hyperparams['synthetic_cost_samples']
    if synN > 0:
        agent = self.cur[cond].sample_list.get_samples()[0].agent
        X, U, _ = self._traj_samples(cond, synN)
        syn_samples = []
        for i in range(synN):
            sample = Sample(agent)
            sample.set_XU(X[i, :, :], U[i, :, :])
            syn_samples.append(sample)
        all_samples = SampleList(
            syn_samples + self.cur[cond].sample_list.get_samples())
    else:
        all_samples = self.cur[cond].sample_list
    N = len(all_samples)

    # Compute cost.
    cs = np.zeros((N, T))
    cc = np.zeros((N, T))
    cv = np.zeros((N, T, dX + dU))
    Cm = np.zeros((N, T, dX + dU, dX + dU))
    if self._hyperparams['ioc']:
        cgt = np.zeros((N, T))
    for n in range(N):
        sample = all_samples[n]
        # Get costs.
        if prev_cost:
            l, lx, lu, lxx, luu, lux = self.previous_cost[cond].eval(sample)
        else:
            l, lx, lu, lxx, luu, lux = self.cost[cond].eval(sample)
        # Compute the ground truth cost.
        if self._hyperparams['ioc'] and n >= synN:
            l_gt, _, _, _, _, _ = self.gt_cost[cond].eval(sample)
            cgt[n, :] = l_gt
        cc[n, :] = l
        cs[n, :] = l

        # Assemble matrix and vector.
        cv[n, :, :] = np.c_[lx, lu]
        Cm[n, :, :, :] = np.concatenate(
            (np.c_[lxx, np.transpose(lux, [0, 2, 1])], np.c_[lux, luu]),
            axis=1
        )

        # Adjust for expanding cost around a sample.
        X = sample.get_X()
        U = sample.get_U()
        yhat = np.c_[X, U]
        rdiff = -yhat
        rdiff_expand = np.expand_dims(rdiff, axis=2)
        cv_update = np.sum(Cm[n, :, :, :] * rdiff_expand, axis=1)
        cc[n, :] += np.sum(rdiff * cv[n, :, :], axis=1) + \
            0.5 * np.sum(rdiff * cv_update, axis=1)
        cv[n, :, :] += cv_update

    # Fill in cost estimate.
    if prev_cost:
        traj_info = self.cur[cond].prevcost_traj_info
        traj_info.dynamics = self.cur[cond].traj_info.dynamics
        traj_info.x0sigma = self.cur[cond].traj_info.x0sigma
        traj_info.x0mu = self.cur[cond].traj_info.x0mu
    else:
        traj_info = self.cur[cond].traj_info
    self.cur[cond].cs = cs[synN:]  # True value of cost.
    traj_info.cc = np.mean(cc, 0)  # Constant term (scalar).
    traj_info.cv = np.mean(cv, 0)  # Linear term (vector).
    traj_info.Cm = np.mean(Cm, 0)  # Quadratic term (matrix).
    if self._hyperparams['ioc']:
        self.cur[cond].cgt = cgt[synN:]
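# ---------------------------------------------------------------------------
# Standalone sketch (not part of the class above) of the "adjust for
# expanding cost around a sample" step in _eval_cost. A quadratic expansion
# taken at the sample point yhat,
#     l(y) ~= cc + cv.(y - yhat) + 0.5 (y - yhat)' Cm (y - yhat),
# is re-centered at y = 0 so it can be used directly as
#     l(y) ~= cc' + cv'.y + 0.5 y' Cm y.
# All names below are local stand-ins, not the class attributes; _eval_cost
# applies the same update vectorized over all T timesteps via np.sum(axis=1).
import numpy as np

dXU = 4
yhat = np.random.randn(dXU)   # sample point [x; u]
cc = 1.0                      # cost value at yhat
cv = np.random.randn(dXU)     # cost gradient at yhat
Cm = np.eye(dXU)              # cost Hessian at yhat

rdiff = -yhat                 # displacement from yhat to the new center 0
cv_update = Cm.dot(rdiff)
cc_new = cc + rdiff.dot(cv) + 0.5 * rdiff.dot(cv_update)
cv_new = cv + cv_update

# Both forms agree at an arbitrary point y.
y = np.random.randn(dXU)
d = y - yhat
original = cc + cv.dot(d) + 0.5 * d.dot(Cm).dot(d)
recentered = cc_new + cv_new.dot(y) + 0.5 * y.dot(Cm).dot(y)
assert np.isclose(original, recentered)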
def sample(
        self, policy, condition, verbose=True, save=True, noisy=True,
        use_TfController=False, timeout=None, reset_cond=None, record=False
):
    """
    Reset and execute a policy and collect a sample.
    Args:
        policy: A Policy object.
        condition: Which condition setup to run.
        verbose: Unused for this agent.
        save: Whether or not to store the trial into the samples.
        noisy: Whether or not to use noise during sampling.
        use_TfController: Whether to use the synchronous TfController.
        timeout: Unused for this agent.
        reset_cond: Which initial state to seed the environment with.
        record: Whether to record a video of this trial.
    Returns:
        sample: A Sample object.
    """
    if noisy:
        noise = generate_noise(self.T, self.dU, self._hyperparams)
    else:
        noise = np.zeros((self.T, self.dU))

    # Get a new sample
    sample = Sample(self)
    self.env.video_callable = lambda episode_id, record=record: record

    # Get initial state
    self.env.seed(None if reset_cond is None else self.x0[reset_cond])
    obs = self.env.reset()
    if self._hyperparams.get('initial_step', 0) > 0:
        # Take one random step to get a slightly random initial state distribution
        U_initial = (self.env.action_space.high - self.env.action_space.low) \
            / 12 * np.random.normal(size=self.dU) * self._hyperparams['initial_step']
        obs = self.env.step(U_initial)[0]
    self.set_states(sample, obs, 0)
    U_0 = policy.act(sample.get_X(0), sample.get_obs(0), 0, noise)
    sample.set(ACTION, U_0, 0)
    for t in range(1, self.T):
        if not record and self.render:
            self.env.render(mode='human')  # TODO add hyperparam

        # Get state
        obs, _, done, _ = self.env.step(sample.get_U(t - 1))
        self.set_states(sample, obs, t)

        # Get action
        U_t = policy.act(sample.get_X(t), sample.get_obs(t), t, noise)
        sample.set(ACTION, U_t, t)

        if done and t < self.T - 1:
            raise Exception('Iteration ended prematurely %d/%d' % (t + 1, self.T))
    if save:
        self._samples[condition].append(sample)
    self.active = False
    #print("X", sample.get_X())
    #print("U", sample.get_U())
    return sample
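# ---------------------------------------------------------------------------
# Standalone sketch of the rollout indexing used in sample() above: the state
# stored at step t is the result of the action stored at step t - 1, so T
# states and T actions stay aligned. Env and the random "policy" below are
# stand-ins for self.env and policy.act.
import numpy as np

class Env(object):
    def reset(self):
        return np.zeros(3)
    def step(self, u):
        return np.random.randn(3), 0.0, False, {}

T, dU = 5, 2
env = Env()
X, U = np.zeros((T, 3)), np.zeros((T, dU))
X[0] = env.reset()
U[0] = np.random.randn(dU)               # policy.act(X[0], ..., 0, noise)
for t in range(1, T):
    obs, _, done, _ = env.step(U[t - 1])  # state t follows action t - 1
    X[t] = obs
    U[t] = np.random.randn(dU)           # policy.act(X[t], ..., t, noise)
    if done and t < T - 1:
        raise Exception('Iteration ended prematurely %d/%d' % (t + 1, T))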
def sample(self, policy, condition, save=True, noisy=True, reset_cond=None, **kwargs):
    """
    Reset and execute a policy and collect a sample.
    Args:
        policy: A Policy object.
        condition: Which condition setup to run.
        save: Whether or not to store the trial into the samples.
        noisy: Whether or not to use noise during sampling.
        reset_cond: Which reset condition to use before sampling.
    Returns:
        sample: A Sample object.
    """
    # Get a new sample
    sample = Sample(self)

    sample_ok = False
    while not sample_ok:
        if not self.debug:
            self.reset(reset_cond)

        self.__init_opcua()

        if noisy:
            noise = generate_noise(self.T, self.dU, self._hyperparams)
        else:
            noise = np.zeros((self.T, self.dU))

        # Execute policy over a time period of [0, T]
        start = time.time()
        for t in range(self.T):
            # Read sensors and store sensor data in sample
            def store_sensor(sensor):
                sample.set(sensor, self.read_sensor(sensor), t)

            self.pool.map(store_sensor, self.sensors)

            # Override sensors
            for override in self.sensor_overrides:
                if override['condition'](t):
                    sensor = override['sensor']
                    sample.set(sensor, np.copy(override['value']), t)

            print('X_%02d' % t, sample.get_X(t))

            # Get action
            U_t = policy.act(sample.get_X(t), sample.get_obs(t), t, noise)

            # Override actuators
            for override in self.actuator_overrides:
                if override['condition'](t):
                    actuator = override['actuator']
                    U_t[self._u_data_idx[actuator]] = np.copy(override['value'])

            # Send signals
            self.send_signals(t)

            # Perform action
            for actuator in self._u_data_idx:
                self.write_actuator(actuator, U_t[self._u_data_idx[actuator]])
            sample.set(ACTION, U_t, t)
            print('U_%02d' % t, U_t)

            # Check if agent is keeping up
            sleep_time = start + (t + 1) * self.dt - time.time()
            if sleep_time < 0:
                logging.critical("Agent can't keep up, %fs behind." % -sleep_time)
            elif sleep_time < self.dt / 2:
                logging.warning(
                    "Agent may not keep up (%.0f percent busy)" %
                    (((self.dt - sleep_time) / self.dt) * 100)
                )

            # Wait for next timestep
            if sleep_time > 0 and not self.debug:
                time.sleep(sleep_time)
        if save:
            self._samples[condition].append(sample)
        self.active = False
        self.finalize_sample()

        sample_ok = input('Continue?') == 'y'
        if not sample_ok:
            print('Repeating')
    return sample
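# ---------------------------------------------------------------------------
# Standalone sketch of the fixed-rate pacing in sample() above: step t must
# finish by start + (t + 1) * dt, and the loop sleeps away any remainder so
# the control frequency stays at 1/dt even when step durations vary. The dt
# value and the work simulated by time.sleep(0.01) are arbitrary stand-ins.
import time

dt, T = 0.05, 10
start = time.time()
for t in range(T):
    time.sleep(0.01)  # stand-in for sensor reads, policy.act, actuator writes
    sleep_time = start + (t + 1) * dt - time.time()
    if sleep_time < 0:
        print("can't keep up, %fs behind" % -sleep_time)
    elif sleep_time > 0:
        time.sleep(sleep_time)
print('elapsed %.2fs, expected %.2fs' % (time.time() - start, T * dt))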