def _worker_set_policy_params(G, params, ma_mode, scope=None):
    G = _get_scoped_G(G, scope)
    if ma_mode == 'concurrent':
        # One parameter vector per policy in the concurrent setting.
        for pid, policy in enumerate(G.policies):
            policy.set_param_values(params[pid])
    else:
        G.policy.set_param_values(params)
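# A driver sketch (assumed, not in the original source) for the worker above,
# following rllab's parallel_sampler convention of broadcasting work to every
# worker process with singleton_pool.run_each. The name set_policy_params is
# hypothetical.
def set_policy_params(params, ma_mode, scope=None):
    singleton_pool.run_each(
        _worker_set_policy_params,
        [(params, ma_mode, scope)] * singleton_pool.n_parallel)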
def train_worker(G, g_counter, g_opt_info, t_max, discount, lock, scope=None):
    G = _get_scoped_G(G, scope=scope)
    env = G.env
    f_train_vfunc = g_opt_info.value["f_train_vfunc"]
    f_train_policy = g_opt_info.value["f_train_policy"]
    policy = g_opt_info.value["target_policy"]
    vfunc = g_opt_info.value["target_vfunc"]
    obs = env.reset()
    observations = []
    actions = []
    rewards = []
    paths = []
    done = False
    while True:
        for _ in range(t_max):
            if done:
                # Restart the episode and use the fresh observation
                # (the original discarded env.reset()'s return value).
                obs = env.reset()
            action, _ = policy.get_action(obs)
            next_obs, reward, done, info = env.step(action)
            observations.append(obs)
            actions.append(action)
            rewards.append(reward)
            obs = next_obs
            with lock:
                g_counter.value += 1
            if done:
                break
        # Keep the buffers from growing without bound.
        observations = observations[-t_max:]
        actions = actions[-t_max:]
        rewards = rewards[-t_max:]
        path = dict(
            observations=np.array(observations),
            actions=np.array(actions),
            rewards=np.array(rewards))
        advantages = []
        returns = []
        if done:
            return_so_far = 0
        else:
            # Bootstrap from the value function when the rollout is truncated.
            return_so_far = vfunc.get_vval(obs)
        # Per-step baseline from the value function. The original referenced an
        # undefined path_baseline; assuming it was meant to be V(s_t).
        baselines = [vfunc.get_vval(o) for o in observations]
        for t in range(len(rewards) - 1, -1, -1):
            # Walk backwards over every step; the original started at
            # len(rewards) - 2 and silently dropped the final reward.
            return_so_far = rewards[t] + discount * return_so_far
            returns.append(return_so_far)
            advantage = return_so_far - baselines[t]
            advantages.append(advantage)
        # The advantages are stored backwards in time, so reverse them.
        advantages = np.array(advantages[::-1])
        # The returns are stored backwards in time, so reverse them too.
        returns = returns[::-1]
        # Normalize advantages.
        advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8)
        path["advantages"] = advantages
        path["returns"] = returns
        paths.append(path)
        do_training(g_opt_info, path)
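# A minimal launch sketch (assumed; not part of the original source) for
# driving train_worker: one process per worker sharing a global step counter
# and a lock, A3C-style. It assumes a fork start method so the children
# inherit G and the compiled train functions. OptInfo is a hypothetical
# stand-in for whatever shared container exposes the `.value` dict the
# worker reads.
import multiprocessing as mp

class OptInfo(object):
    def __init__(self, value):
        self.value = value  # dict with f_train_vfunc, f_train_policy, ...

def launch_train_workers(G, opt_info_dict, n_workers, t_max=20, discount=0.99):
    g_counter = mp.Value('i', 0)  # shared count of environment steps taken
    lock = mp.Lock()              # serializes increments of g_counter
    g_opt_info = OptInfo(opt_info_dict)
    workers = [mp.Process(target=train_worker,
                          args=(G, g_counter, g_opt_info, t_max, discount, lock))
               for _ in range(n_workers)]
    for w in workers:
        w.start()
    return workers, g_counter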
def _worker_collect_one_path_snn(G, max_path_length, switch_lat_every=0, scope=None):
    # Dropped the stray `self` from the signature: this is a module-level
    # worker invoked with G as its first argument, like the others here.
    G = parallel_sampler._get_scoped_G(G, scope)
    path = rollout_snn(G.env, G.policy, max_path_length,
                       switch_lat_every=switch_lat_every)
    return path, len(path["rewards"])
def _worker_collect_path_one_env_a2c(G, max_path_length, ma_mode, scope=None):
    G = _get_scoped_G(G, scope)
    if ma_mode == 'decentralized':
        paths = dec_roll_out_a2c(G.env, G.policy, max_path_length)
        lengths = [len(path['rewards']) for path in paths]
        return paths, sum(lengths)
    else:
        raise NotImplementedError("incorrect rollout type")
def _worker_collect_path_one_env(G, max_path_length, ma_mode, scope=None):
    G = parallel_sampler._get_scoped_G(G, scope)
    if isinstance(G.env.wrapped_env, SimPyRollout):
        paths = ed_simpy_dec_rollout(G.env, G.policy, max_path_length)
    else:
        paths = ed_dec_rollout(G.env, G.policy, max_path_length)
    lengths = [len(path['rewards']) for path in paths]
    return paths, sum(lengths)
def _worker_populate_task(G, env, policy, ma_mode, scope=None):
    # TODO: better term covering both policy/policies
    G = _get_scoped_G(G, scope)
    G.env = pickle.loads(env)
    if ma_mode == 'concurrent':
        G.policies = pickle.loads(policy)
        assert isinstance(G.policies, list)
    else:
        G.policy = pickle.loads(policy)
def _worker_collect_path_random_one_env(G, max_path_length, ma_mode, sampler, scope=None):
    G = _get_scoped_G(G, scope)
    if ma_mode == 'decentralized':
        paths = dec_roll_out_random(G.env, G.policy, sampler, max_path_length)
        # Return the total step count alongside the paths, matching the other
        # collectors (the original returned `paths, paths`).
        lengths = [len(path['rewards']) for path in paths]
        return paths, sum(lengths)
    else:
        raise NotImplementedError("incorrect rollout type")
def _worker_collect_adr_one_env(G, max_path_length, ma_mode, scope=None):
    G = parallel_sampler._get_scoped_G(G, scope)
    paths = ed_simpy_dec_rollout(G.env, G.policy, max_path_length)
    adr = []
    for path in paths:
        t_sojourn = path["offset_t_sojourn"]
        gamma = G.env.wrapped_env.discount  # fixed typo: was `wrappend_env`
        # Continuous-time discounting: one factor per step, based on its
        # sojourn time.
        discount_gamma = np.exp(-gamma * t_sojourn)
        path_adr = variable_discount_cumsum(path["rewards"], discount_gamma)
        avg_discounted_return = path_adr[0]
        adr.append(avg_discounted_return)
    return np.mean(adr), 1
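# variable_discount_cumsum is defined elsewhere in this codebase; below is a
# plausible reference implementation (an assumption, not the original) of the
# semi-MDP recursion used above: G_t = r_t + discounts[t] * G_{t+1}, with
# discounts[t] = exp(-gamma * sojourn_time_t), so that path_adr[0] is the
# discounted return of the whole path.
def _variable_discount_cumsum_sketch(x, discounts):
    out = np.zeros(len(x))
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discounts[t] * running
        out[t] = running
    return out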
def populate_task(env, policy, scope=None):
    logger.log("Populating workers...")
    if singleton_pool.n_parallel > 1:
        singleton_pool.run_each(
            _worker_populate_task,
            [(pickle.dumps(env), pickle.dumps(policy), scope)]
            * singleton_pool.n_parallel)
    else:
        # Avoid unnecessary copying; there are still some issues when making
        # multiple copies.
        G = _get_scoped_G(singleton_pool.G, scope)
        G.env = pickle.loads(pickle.dumps(env))
        G.policy = pickle.loads(pickle.dumps(policy))
    logger.log("Populated")
def _worker_terminate_task(G, scope=None):
    G = _get_scoped_G(G, scope)
    if getattr(G, "env", None):
        G.env.terminate()
        G.env = None
    if getattr(G, "policy", None):
        G.policy.terminate()
        G.policy = None
    if getattr(G, "policies", None):
        for policy in G.policies:
            policy.terminate()
        G.policies = None
    if getattr(G, "sess", None):
        G.sess.close()
        G.sess = None
def populate_task(env, policy, ma_mode, scope=None):
    logger.log("Populating workers...")
    logger.log("ma_mode={}".format(ma_mode))
    if singleton_pool.n_parallel > 1:
        singleton_pool.run_each(
            _worker_populate_task,
            [(pickle.dumps(env), pickle.dumps(policy), ma_mode, scope)]
            * singleton_pool.n_parallel)
    else:
        # Avoid unnecessary copying.
        G = _get_scoped_G(singleton_pool.G, scope)
        G.env = env
        if ma_mode == 'concurrent':
            G.policies = policy
        else:
            G.policy = policy
    logger.log("Populated")
def _worker_collect_path_one_env(G, max_path_length, ma_mode, scope=None):
    G = _get_scoped_G(G, scope)
    if ma_mode == 'centralized':
        path = cent_rollout(G.env, G.policy, max_path_length)
        return path, len(path['rewards'])
    elif ma_mode == 'decentralized':
        paths = dec_rollout(G.env, G.policy, max_path_length)
        # Path lengths are limited both by the env's MAX_TIME_STEPS and by
        # the sampler's max_path_length.
        lengths = [len(path['rewards']) for path in paths]
        return paths, sum(lengths)
    elif ma_mode == 'concurrent':
        paths = conc_rollout(G.env, G.policies, max_path_length)
        lengths = [len(path['rewards']) for path in paths]
        return paths, lengths[0]
    else:
        raise NotImplementedError("incorrect rollout type")
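# A minimal sampling-driver sketch (assumed, not in the original source)
# showing how the collector above is typically driven in rllab-style code:
# singleton_pool.run_collect repeatedly invokes the worker and sums the
# second element of its return value until `threshold` total steps have been
# gathered, returning the list of collected first elements (the paths).
def sample_paths(max_samples, max_path_length, ma_mode, scope=None):
    return singleton_pool.run_collect(
        _worker_collect_path_one_env,
        threshold=max_samples,
        args=(max_path_length, ma_mode, scope),
        show_prog_bar=True)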