Example No. 1
def _worker_set_policy_params(G, params, ma_mode, scope=None):
    G = _get_scoped_G(G, scope)
    if ma_mode == 'concurrent':
        for pid, policy in enumerate(G.policies):
            policy.set_param_values(params[pid])
    else:
        G.policy.set_param_values(params)
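All of these workers resolve their globals through _get_scoped_G, which is not shown on this page. As a rough sketch (modelled on rllab's parallel_sampler; the real helper may differ in detail), it lazily creates one shared-globals container per scope:

def _get_scoped_G(G, scope):
    # Sketch only: return the per-scope globals container, creating it on
    # first use. With no scope, the top-level container is used directly.
    if scope is None:
        return G
    if not hasattr(G, "scopes"):
        G.scopes = dict()
    if scope not in G.scopes:
        G.scopes[scope] = type(G)()  # assumes G's class has a no-arg constructor
        G.scopes[scope].worker_id = getattr(G, "worker_id", None)
    return G.scopes[scope]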
Example No. 2
def train_worker(G, g_counter, g_opt_info, t_max, discount, lock, scope=None):
    G = _get_scoped_G(G, scope=scope)
    env = G.env
    f_train_vfunc = g_opt_info.value["f_train_vfunc"]
    f_train_policy = g_opt_info.value["f_train_policy"]
    policy = g_opt_info.value["target_policy"]
    vfunc = g_opt_info.value["target_vfunc"]

    obs = env.reset()
    observations = []
    actions = []
    rewards = []
    paths = []
    done = False
    while True:
        for _ in range(t_max):
            if done:
                obs = env.reset()  # start a new episode; re-bind obs so the stale terminal observation is not reused
            action, _ = policy.get_action(obs)
            next_obs, reward, done, info = env.step(action)
            observations.append(obs)
            actions.append(action)
            rewards.append(reward)
            obs = next_obs
            with lock:
                g_counter.value += 1
            if done:
                break

        # keep only the most recent t_max transitions so the buffers do not grow without bound
        observations = observations[-t_max:]
        actions = actions[-t_max:]
        rewards = rewards[-t_max:]

        path = dict(observations=np.array(observations),
                    actions=np.array(actions),
                    rewards=np.array(rewards))

        advantages = []
        returns = []
        # bootstrap with 0 at a terminal state, else with the value estimate
        # of the last observed state
        if done:
            return_so_far = 0
        else:
            return_so_far = vfunc.get_vval(obs)
        # path_baseline was undefined in the original snippet; here it is
        # assumed vfunc.get_vval accepts a single observation, as in the
        # bootstrap above
        path_baseline = [vfunc.get_vval(o) for o in observations]
        # walk the segment backwards; start at len(rewards) - 1 so the last
        # reward is included (the original started at len(rewards) - 2)
        for t in range(len(rewards) - 1, -1, -1):
            return_so_far = rewards[t] + discount * return_so_far
            returns.append(return_so_far)
            advantage = return_so_far - path_baseline[t]
            advantages.append(advantage)
        # advantages and returns were accumulated backwards in time, so reverse them
        advantages = np.array(advantages[::-1])
        returns = np.array(returns[::-1])
        # normalize advantages
        advantages = (advantages - np.mean(advantages)) / (np.std(advantages) +
                                                           1e-8)

        path["advantages"] = advantages
        path["returns"] = returns
        paths.append(path)

        do_training(g_opt_info, path)
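To make the backward return computation above concrete, here is a tiny self-contained illustration with made-up numbers (no rllab objects involved); when the segment did not terminate, the value estimate of the final observed state seeds return_so_far:

import numpy as np

rewards = [1.0, 0.0, 2.0]      # hypothetical 3-step segment
discount = 0.99
bootstrap_value = 0.5          # stands in for vfunc.get_vval(obs) at the last state

returns = []
return_so_far = bootstrap_value
for t in range(len(rewards) - 1, -1, -1):   # walk the segment backwards
    return_so_far = rewards[t] + discount * return_so_far
    returns.append(return_so_far)
returns = np.array(returns[::-1])           # back to forward time order
# returns[0] folds in all three rewards plus the discounted bootstrap:
# 1.0 + 0.99 * (0.0 + 0.99 * (2.0 + 0.99 * 0.5)) ≈ 3.445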
Example No. 3
def _worker_collect_one_path_snn(self, G, max_path_length, switch_lat_every=0, scope=None):
    G = parallel_sampler._get_scoped_G(G, scope)
    # print("G", G)
    # print("use source policy")
    path = rollout_snn(G.env, G.policy, max_path_length, switch_lat_every=switch_lat_every)
    # print("path_length", len(path["rewards"]))
    return path, len(path["rewards"])
Example No. 4
def _worker_collect_path_one_env_a2c(G, max_path_length, ma_mode, scope=None):
    G = _get_scoped_G(G, scope)
    if ma_mode == 'decentralized':
        paths = dec_roll_out_a2c(G.env, G.policy, max_path_length)
        lengths = [len(path['rewards']) for path in paths]
        return paths, sum(lengths)
    else:
        raise NotImplementedError("incorrect rollout type")
Example No. 5
def _worker_collect_path_one_env(G, max_path_length, ma_mode, scope=None):
    G = parallel_sampler._get_scoped_G(G, scope)
    if isinstance(G.env.wrapped_env, SimPyRollout):
        paths = ed_simpy_dec_rollout(G.env, G.policy, max_path_length)
    else:
        paths = ed_dec_rollout(G.env, G.policy, max_path_length)
    lengths = [len(path['rewards']) for path in paths]
    return paths, sum(lengths)
Example No. 6
def _worker_populate_task(G, env, policy, ma_mode, scope=None):
    # TODO: better term for both policy/policies
    G = _get_scoped_G(G, scope)
    G.env = pickle.loads(env)
    if ma_mode == 'concurrent':
        G.policies = pickle.loads(policy)
        assert isinstance(G.policies, list)
    else:
        G.policy = pickle.loads(policy)
Example No. 7
def _worker_collect_path_random_one_env(G,
                                        max_path_length,
                                        ma_mode,
                                        sampler,
                                        scope=None):
    G = _get_scoped_G(G, scope)
    if ma_mode == 'decentralized':
        paths = dec_roll_out_random(G.env, G.policy, sampler, max_path_length)
        return paths, paths
    else:
        raise NotImplementedError("incorrect rollout type")
Example No. 8
def _worker_collect_adr_one_env(G, max_path_length, ma_mode, scope=None):
    G = parallel_sampler._get_scoped_G(G, scope)
    paths = ed_simpy_dec_rollout(G.env, G.policy, max_path_length)
    adr = []
    for path in paths:
        t_sojourn = path["offset_t_sojourn"]
        gamma = G.env.wrapped_env.discount
        discount_gamma = np.exp(-gamma * t_sojourn)
        path_adr = variable_discount_cumsum(path["rewards"], discount_gamma)
        avg_discounted_return = path_adr[0]
        adr.append(avg_discounted_return)
    return np.mean(adr), 1
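variable_discount_cumsum is defined elsewhere in the codebase; a minimal sketch of a helper along these lines, assuming the convention that discounts[t] discounts everything that happens after step t, might look like:

import numpy as np

def variable_discount_cumsum_sketch(rewards, discounts):
    # out[t] = rewards[t] + discounts[t] * out[t + 1], computed backwards
    out = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + discounts[t] * running
        out[t] = running
    return out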
Example No. 9
def populate_task(env, policy, scope=None):
    logger.log("Populating workers...")
    if singleton_pool.n_parallel > 1:
        singleton_pool.run_each(
            _worker_populate_task,
            [(pickle.dumps(env), pickle.dumps(policy), scope)] *
            singleton_pool.n_parallel)
    else:
        # avoid unnecessary copying
        # still some issues when doing multiple copies
        G = _get_scoped_G(singleton_pool.G, scope)
        G.env = pickle.loads(pickle.dumps(env))
        G.policy = pickle.loads(pickle.dumps(policy))
    logger.log("Populated")
Example No. 10
def _worker_terminate_task(G, scope=None):
    G = _get_scoped_G(G, scope)
    if getattr(G, "env", None):
        G.env.terminate()
        G.env = None
    if getattr(G, "policy", None):
        G.policy.terminate()
        G.policy = None
    if getattr(G, "policies", None):
        for policy in G.policies:
            policy.terminate()
        G.policies = None
    if getattr(G, "sess", None):
        G.sess.close()
        G.sess = None
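A driver-side counterpart (not shown on this page) would typically fan this worker out over the pool; a hedged sketch, assuming rllab's singleton_pool.run_each interface:

def terminate_task(scope=None):
    # Sketch: ask every worker process to tear down its scoped env/policy/session.
    singleton_pool.run_each(
        _worker_terminate_task,
        [(scope,)] * singleton_pool.n_parallel)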
Example No. 11
def populate_task(env, policy, ma_mode, scope=None):
    logger.log("Populating workers...")
    logger.log("ma_mode={}".format(ma_mode))
    if singleton_pool.n_parallel > 1:
        singleton_pool.run_each(
            _worker_populate_task,
            [(pickle.dumps(env), pickle.dumps(policy), ma_mode, scope)] *
            singleton_pool.n_parallel)
    else:
        # avoid unnecessary copying
        G = _get_scoped_G(singleton_pool.G, scope)
        G.env = env
        if ma_mode == 'concurrent':
            G.policies = policy
        else:
            G.policy = policy
    logger.log("Populated")
Example No. 12
def _worker_collect_path_one_env(G, max_path_length, ma_mode, scope=None):
    G = _get_scoped_G(G, scope)
    if ma_mode == 'centralized':
        path = cent_rollout(G.env, G.policy, max_path_length)
        return path, len(path['rewards'])
    elif ma_mode == 'decentralized':
        paths = dec_rollout(G.env, G.policy, max_path_length)
        # print('_worker_collect_path_one_env number of paths = {}'.format(len(paths)))
        lengths = [
            len(path['rewards']) for path in paths
        ]  # limited by MAX_TIME_STEPS of env and max_path_length of sampler
        # print('mean path len = {}'.format(np.mean(lengths)))
        return paths, sum(lengths)
    elif ma_mode == 'concurrent':
        paths = conc_rollout(G.env, G.policies, max_path_length)
        lengths = [len(path['rewards']) for path in paths]
        return paths, lengths[0]
    else:
        raise NotImplementedError("incorrect rollout type")
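For context, a sample_paths-style driver usually ties the pieces above together: push the latest policy parameters to every worker, then collect rollouts until a sample budget is met. A rough sketch, assuming rllab's singleton_pool.run_each/run_collect interface and the decentralized worker from Example No. 12:

def sample_paths(policy_params, max_samples, max_path_length, ma_mode='decentralized', scope=None):
    # broadcast the current parameters to every worker process
    singleton_pool.run_each(
        _worker_set_policy_params,
        [(policy_params, ma_mode, scope)] * singleton_pool.n_parallel)
    # keep collecting (paths, n_samples) pairs until max_samples is reached
    return singleton_pool.run_collect(
        _worker_collect_path_one_env,
        threshold=max_samples,
        args=(max_path_length, ma_mode, scope),
        show_prog_bar=True)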