def regularization_paths(self, methods, n_samples=1000, n_eps=1, seed=1,
                         criteria=["RMSBE"], verbose=0):
    """Learn once on a set of sampled trajectories and evaluate the full
    regularization path of each method.

    Returns a dict mapping each criterion to a list (one entry per method)
    of (tau, theta, error) tuples along that method's regularization path.
    """
    # Initialization
    self._init_methods(methods)
    err_f = [self._init_error_fun(criterion) for criterion in criteria]
    errors = dict([(crit, [[] for m in methods]) for crit in criteria])
    for m in methods:
        m.reset_trace()

    # Generate trajectories
    s, a, r, s_n, restarts = self.mdp.samples_cached(
        n_iter=n_samples, n_restarts=n_eps,
        policy=self.behavior_policy, seed=seed)

    if self.off_policy:
        m_a_beh = policies.mean_action_trajectory(self.behavior_policy, s)
        m_a_tar = policies.mean_action_trajectory(self.target_policy, s)
        rhos = np.zeros_like(r)
        self.rhos = rhos

    # Method learning
    with ProgressBar(enabled=(verbose > 2.)) as p:
        for i in xrange(n_samples * n_eps):
            p.update(i, n_samples * n_eps)
            f0 = self.phi(s[i])
            f1 = self.phi(s_n[i])
            if restarts[i]:
                for k, m in enumerate(methods):
                    m.reset_trace()
            for k, m in enumerate(methods):
                if self.off_policy:
                    # Importance weight rho_i = pi(a_i|s_i) / beta(a_i|s_i)
                    rhos[i] = self.target_policy.p(s[i], a[i], mean=m_a_tar[i]) \
                        / self.behavior_policy.p(s[i], a[i], mean=m_a_beh[i])
                    m.update_V(s[i], s_n[i], r[i], rho=rhos[i], f0=f0, f1=f1)
                else:
                    m.update_V(s[i], s_n[i], r[i], f0=f0, f1=f1)

    # Evaluate every (tau, theta) point on each method's regularization path
    for k, m in enumerate(methods):
        for tau, theta in m.regularization_path():
            for i_e, crit in enumerate(criteria):
                errors[crit][k].append((tau, theta, err_f[i_e](theta)))
    return errors
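# Usage sketch (hypothetical, not part of the original code): assuming `task` is
# an instance of the surrounding task class and `methods` is a list of configured
# linear TD methods that implement `regularization_path()`, the returned dict
# could be inspected roughly like this:
#
#   paths = task.regularization_paths(methods, n_samples=2000, n_eps=1,
#                                     criteria=["RMSBE"], verbose=3)
#   for k, path in enumerate(paths["RMSBE"]):
#       for tau, theta, err in path:
#           print k, tau, err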
def error_traces(self, methods, n_samples=1000, n_eps=1, verbose=0., seed=1,
                 criteria=["RMSBE"], error_every=1, episodic=False,
                 eval_on_traces=False, n_samples_eval=None):
    """Train all methods on one set of sampled trajectories and record the
    error criteria either every `error_every` transitions or, if `episodic`
    is True, once per episode.

    Returns an array of shape (len(methods), len(criteria), n_e).
    """
    # Initialization
    self._init_methods(methods)
    err_f = [self._init_error_fun(criterion) for criterion in criteria]
    err_f_gen = [self._init_error_fun(criterion, general=True)
                 for criterion in criteria]
    if episodic:
        n_e = n_eps
        k_e = 0
    else:
        n_e = int(np.ceil(float(n_samples * n_eps) / error_every))
    errors = np.ones((len(methods), len(criteria), n_e)) * np.inf
    for m in methods:
        m.reset_trace()

    # Generate trajectories
    with Timer("Generate Samples", active=(verbose > 1.)):
        s, a, r, s_n, restarts = self.mdp.samples_cached(
            n_iter=n_samples, n_restarts=n_eps,
            policy=self.behavior_policy, seed=seed, verbose=verbose)
    # Second, independently sampled transition from each state s[i]
    # (for methods that need double sampling)
    with Timer("Generate Double Samples", active=(verbose > 1.)):
        a2, r2, s_n2 = self.mdp.samples_cached_transitions(
            policy=self.behavior_policy, states=s, seed=seed)
    if eval_on_traces:
        print "Evaluating on states of the sampled trajectories"
        self.set_mu_from_states(seed=self.mu_seed, s=s,
                                n_samples_eval=n_samples_eval)

    if self.off_policy:
        with Timer("Generate off-policy weights", active=(verbose > 1.)):
            m_a_beh = policies.mean_action_trajectory(self.behavior_policy, s)
            m_a_tar = policies.mean_action_trajectory(self.target_policy, s)
            rhos = np.zeros_like(r)
            rhos2 = np.zeros_like(r2)
            self.rhos = rhos

    # Method learning
    with ProgressBar(enabled=(verbose > 2.)) as p:
        for i in xrange(n_samples * n_eps):
            p.update(i, n_samples * n_eps)
            f0 = self.phi(s[i])
            f1 = self.phi(s_n[i])
            f1t = self.phi(s_n2[i])
            if restarts[i]:
                for k, m in enumerate(methods):
                    m.reset_trace()
                    if episodic:
                        cur_theta = m.theta
                        if not np.isfinite(np.sum(cur_theta)):
                            errors[k, :, k_e] = np.nan
                            continue
                        for i_e in range(len(criteria)):
                            if isinstance(m, td.LinearValueFunctionPredictor):
                                errors[k, i_e, k_e] = err_f[i_e](cur_theta)
                            else:
                                errors[k, i_e, k_e] = err_f_gen[i_e](m.V)
                if episodic:
                    k_e += 1
                    if k_e >= n_e:
                        break
            for k, m in enumerate(methods):
                if self.off_policy:
                    # Importance weights for the observed and the resampled action
                    rhos[i] = self.target_policy.p(s[i], a[i], mean=m_a_tar[i]) \
                        / self.behavior_policy.p(s[i], a[i], mean=m_a_beh[i])
                    rhos2[i] = self.target_policy.p(s[i], a2[i], mean=m_a_tar[i]) \
                        / self.behavior_policy.p(s[i], a2[i], mean=m_a_beh[i])
                    m.update_V(s[i], s_n[i], r[i], rho=rhos[i], rhot=rhos2[i],
                               f0=f0, f1=f1, f1t=f1t, s1t=s_n2[i], rt=r2[i])
                else:
                    m.update_V(s[i], s_n[i], r[i], f0=f0, f1=f1,
                               s1t=s_n2[i], f1t=f1t, rt=r2[i])
                if i % error_every == 0 and not episodic:
                    cur_theta = m.theta
                    if not np.isfinite(np.sum(cur_theta)):
                        errors[k, :, int(i / error_every)] = np.nan
                        continue
                    for i_e in range(len(criteria)):
                        if isinstance(m, td.LinearValueFunctionPredictor):
                            errors[k, i_e, int(i / error_every)] = \
                                err_f[i_e](cur_theta)
                        else:
                            errors[k, i_e, int(i / error_every)] = \
                                err_f_gen[i_e](m.V)
    return errors
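# Usage sketch (hypothetical): the result is an array of shape
# (len(methods), len(criteria), n_e); with error_every=100 the last axis
# corresponds to transitions 0, 100, 200, ...  For example:
#
#   errs = task.error_traces(methods, n_samples=5000, n_eps=1,
#                            criteria=["RMSBE"], error_every=100)
#   best = np.nanargmin(errs[:, 0, -1])   # method with the lowest final RMSBE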
def error_traces_cpu_time(self, method, max_t=600, max_passes=None,
                          min_diff=0.1, n_samples=1000, n_eps=1, verbose=0.,
                          seed=1, criteria=["RMSBE"], error_every=1,
                          eval_on_traces=False, n_samples_eval=None,
                          eval_once=False):
    """Train a single method and record the error criteria as a function of
    the CPU time the method has consumed.

    The sampled trajectories are reused (up to `max_passes` passes) until
    `max_t` seconds of method time have elapsed. Returns the lists
    (errors, processed, times); if `eval_once` is True, a single final
    evaluation (errors, method.time) is returned instead.
    """
    # Initialization
    self._init_methods([method])
    err_f = [self._init_error_fun(criterion) for criterion in criteria]
    err_f_gen = [self._init_error_fun(criterion, general=True)
                 for criterion in criteria]
    times = []
    errors = []
    processed = []
    method.reset_trace()
    if hasattr(method, "lam") and method.lam > 0.:
        print "WARNING: reusing samples is only valid without eligibility traces"

    # Generate trajectories
    with Timer("Generate Samples", active=(verbose > 1.)):
        s, a, r, s_n, restarts = self.mdp.samples_cached(
            n_iter=n_samples, n_restarts=n_eps,
            policy=self.behavior_policy, seed=seed, verbose=verbose)
    # Second, independently sampled transition from each state s[i]
    with Timer("Generate Double Samples", active=(verbose > 1.)):
        a2, r2, s_n2 = self.mdp.samples_cached_transitions(
            policy=self.behavior_policy, states=s, seed=seed)
    if eval_on_traces:
        print "Evaluating on states of the sampled trajectories"
        self.set_mu_from_states(seed=self.mu_seed, s=s,
                                n_samples_eval=n_samples_eval)

    if self.off_policy:
        with Timer("Generate off-policy weights", active=(verbose > 1.)):
            m_a_beh = policies.mean_action_trajectory(self.behavior_policy, s)
            m_a_tar = policies.mean_action_trajectory(self.target_policy, s)
            rhos = np.zeros_like(r)
            rhos2 = np.zeros_like(r2)
            self.rhos = rhos

    # Method learning
    i = 0          # index of the current transition
    last_t = 0.    # method time at the last error measurement
    passes = 0     # number of completed passes over the data
    u = 0          # number of processed transitions
    with ProgressBar(enabled=(verbose > 2.)) as p:
        while method.time < max_t:
            f0 = self.phi(s[i])
            f1 = self.phi(s_n[i])
            f1t = self.phi(s_n2[i])
            # assert not np.any(np.isnan(f0))
            # assert not np.any(np.isnan(f1))
            # assert not np.any(np.isnan(f1t))
            if restarts[i]:
                method.reset_trace()
            if self.off_policy:
                rhos[i] = self.target_policy.p(s[i], a[i], mean=m_a_tar[i]) \
                    / self.behavior_policy.p(s[i], a[i], mean=m_a_beh[i])
                rhos2[i] = self.target_policy.p(s[i], a2[i], mean=m_a_tar[i]) \
                    / self.behavior_policy.p(s[i], a2[i], mean=m_a_beh[i])
                method.update_V(s[i], s_n[i], r[i], rho=rhos[i], rhot=rhos2[i],
                                f0=f0, f1=f1, f1t=f1t, s1t=s_n2[i], rt=r2[i])
            else:
                method.update_V(s[i], s_n[i], r[i], f0=f0, f1=f1,
                                s1t=s_n2[i], f1t=f1t, rt=r2[i])
            u += 1
            assert method.time > last_t
            if method.time - last_t > min_diff:
                p.update(method.time, max_t)
                last_t = method.time
                if not eval_once:
                    cur_theta = method.theta
                    e = np.empty(len(criteria))
                    for i_e in range(len(criteria)):
                        e[i_e] = err_f[i_e](cur_theta)
                    errors.append(e)
                    processed.append(u)
                    times.append(method.time)
            i += 1
            if i >= n_samples * n_eps:
                # Start another pass over the same samples
                passes += 1
                if max_passes is not None and passes >= max_passes:
                    break
                i = i % (n_samples * n_eps)

    if eval_once:
        cur_theta = method.theta
        e = np.empty(len(criteria))
        for i_e in range(len(criteria)):
            e[i_e] = err_f[i_e](cur_theta)
        return e, method.time
    return errors, processed, times
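# Usage sketch (hypothetical): measure how quickly a single method reduces the
# error per unit of CPU time; `errors[j]` holds one value per criterion,
# recorded at method time `times[j]` after `processed[j]` transition updates:
#
#   errs, processed, times = task.error_traces_cpu_time(
#       method, max_t=60., min_diff=0.5, n_samples=5000, criteria=["RMSBE"])
#   plt.plot(times, [e[0] for e in errs])   # assumes matplotlib.pyplot as plt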