예제 #1
0
def sample_paths_one_core(N,
                          policy,
                          T=1e6,
                          env=None,
                          env_name=None,
                          pegasus_seed=None,
                          mode='sample'):
    """
    Gather rollouts one at a time, in the current process, until at least
    N samples (summed length of the "rewards" entry of each path) exist.

    params:
    N               : number of sample points
    policy          : policy to be used to sample the data
    T               : maximum length of trajectory (capped at env.horizon;
                      only forwarded to the sampler in 'sample' mode)
    env             : env object to sample from
    env_name        : name of env to be sampled from
                      (one of env or env_name must be specified)
    pegasus_seed    : seed for environment (numpy seed must be set externally)
    mode            : 'sample' rolls out through base_sampler.do_rollout,
                      'evaluation' through eval_sampler.do_evaluation_rollout;
                      any other value prints a message and stops sampling

    returns:
    list of paths, one entry per collected trajectory (empty if mode is
    not recognised)

    raises:
    ValueError if neither env nor env_name is given
    """

    if env is None:
        if env_name is None:
            # Fail here, explicitly, instead of announcing an error and then
            # relying on get_environment(None) to blow up somewhere deeper.
            raise ValueError(
                "No environment specified! One of env or env_name is required")
        env = get_environment(env_name)
    if pegasus_seed is not None:
        env.env._seed(pegasus_seed)
    T = min(T, env.horizon)

    start_time = timer.time()

    print("####### Gathering Samples #######")
    sampled_so_far = 0
    paths = []
    # Each rollout gets its own seed, counting up from the pegasus seed
    # (or from 0 when none was supplied).
    seed = pegasus_seed if pegasus_seed is not None else 0

    while sampled_so_far < N:
        if mode == 'sample':
            this_path = base_sampler.do_rollout(1, policy, T, env, env_name,
                                                seed)  # do 1 rollout
        elif mode == 'evaluation':
            this_path = eval_sampler.do_evaluation_rollout(
                1, policy, env, env_name, seed)
        else:
            print(
                "Mode has to be either 'sample' for training time or 'evaluation' for test time performance"
            )
            break
        # The samplers return a list of paths; exactly one was requested.
        paths.append(this_path[0])
        seed += 1
        sampled_so_far += len(this_path[0]["rewards"])

    print("======= Samples Gathered  ======= | >>>> Time taken = %f " %
          (timer.time() - start_time))
    print(
        "................................. | >>>> # samples = %i # trajectories = %i "
        % (sampled_so_far, len(paths)))
    return paths
예제 #2
0
def sample_paths_parallel(N,
                          policy,
                          T=1e2,
                          env_name=None,
                          pegasus_seed=None,
                          num_cpu='max',
                          max_process_time=300,
                          max_timeouts=4,
                          suppress_print=False,
                          mode='sample'):
    """
    Split a sampling request across several worker processes.

    params:
    N                : amount of work requested; divided (rounded up) evenly
                       between the workers, so the total may exceed N
    policy           : policy handed to every worker
    T                : forwarded to the sampler as the trajectory length limit
    env_name         : name of the env each worker samples from
    pegasus_seed     : base seed; worker i receives
                       pegasus_seed + i * (per-worker share), or None
    num_cpu          : worker count; None or 'max' uses mp.cpu_count(),
                       1 samples in the current process, other values are
                       capped at mp.cpu_count()
    max_process_time : forwarded to _try_multiprocess
    max_timeouts     : forwarded to _try_multiprocess
    suppress_print   : when true, skip the progress/timing messages
    mode             : 'sample' or 'evaluation'; forwarded to
                       _try_multiprocess, and honoured on the single-CPU path

    returns:
    flat list of paths gathered from all workers
    """

    if num_cpu is None or num_cpu == 'max':
        num_cpu = mp.cpu_count()
    elif num_cpu == 1:
        # Single-CPU shortcut: no worker pool needed. Respect `mode` here as
        # well, otherwise an evaluation request silently runs training
        # rollouts.
        if mode == 'evaluation':
            return eval_sampler.do_evaluation_rollout(N, policy, None,
                                                      env_name, pegasus_seed)
        return base_sampler.do_rollout(N, policy, T, None, env_name,
                                       pegasus_seed)
    else:
        num_cpu = min(mp.cpu_count(), num_cpu)

    paths_per_cpu = int(np.ceil(N / num_cpu))
    # One argument list per worker; seeds are offset by the per-worker share
    # so that no two workers start from the same seed.
    args_list = [
        [
            paths_per_cpu, policy, T, None, env_name,
            None if pegasus_seed is None else pegasus_seed + i * paths_per_cpu
        ]
        for i in range(num_cpu)
    ]

    # Do multiprocessing
    if not suppress_print:
        start_time = timer.time()
        print("####### Gathering Samples #######")

    results = _try_multiprocess(args_list, num_cpu, max_process_time,
                                max_timeouts, mode)
    # result is a paths type and results is list of paths
    paths = [path for result in results for path in result]

    if not suppress_print:
        print("======= Samples Gathered  ======= | >>>> Time taken = %f " %
              (timer.time() - start_time))

    return paths
예제 #3
0
def sample_paths(N,
                 policy,
                 T=1e2,
                 env=None,
                 env_name=None,
                 pegasus_seed=None,
                 mode='sample'):
    """
    Dispatch a sampling request to the sampler selected by ``mode``.

    :param N : forwarded as the first argument of the chosen sampler
    :param policy : policy forwarded to the chosen sampler
    :param T : forwarded to base_sampler.do_rollout only ('sample' mode)
    :param env : env object forwarded to the chosen sampler
    :param env_name : env name forwarded to the chosen sampler
    :param pegasus_seed : seed forwarded to the chosen sampler
    :param mode : 'sample' uses base_sampler.do_rollout, 'evaluation' uses
                  eval_sampler.do_evaluation_rollout
    :return : whatever the chosen sampler returns; None (after printing a
              message) when mode is neither 'sample' nor 'evaluation'
    """
    if mode == 'evaluation':
        # The evaluation sampler is called without T.
        evaluation_paths = eval_sampler.do_evaluation_rollout(
            N, policy, env, env_name, pegasus_seed)
        return evaluation_paths

    if mode != 'sample':
        print(
            "Mode has to be either 'sample' for training time or 'evaluation' for test time performance"
        )
        return None

    training_paths = base_sampler.do_rollout(N, policy, T, env, env_name,
                                             pegasus_seed)
    return training_paths
예제 #4
0
            observations.shape[0]),
                                                 size=2,
                                                 replace=False)
        first_obs = observations[first_obs]
        second_obs = observations[second_obs]
        pairwise_dist = np.linalg.norm(first_obs - second_obs)
        array[i] = pairwise_dist
    mean = np.mean(array)
    return mean


if __name__ == "__main__":
    # Manual smoke test: roll out a random policy on Pendulum, then run the
    # returns, advantage and average step distance helpers on the result and
    # print the distance.
    N = 1
    pol = base_sampler.RandomPolicy((-2, 2))
    T = 5
    env_name = "Pendulum-v0"
    env = GymEnv(env_name)
    gamma = 0.9
    paths = base_sampler.do_rollout(N, pol, T, env)
    # compute_returns / compute_advantages are called for their effect on
    # `paths`; their return values are not used here.
    compute_returns(paths, gamma)
    baseline = linear_baseline.LinearBaseline(env.spec)
    compute_advantages(paths, baseline, gamma, gae_lambda=0.9)
    mean = get_avg_step_distance(paths)
    print(mean)
예제 #5
0
        Predicts the value function for each state with a set of fit weights
        :param path : it is dictionary as returned by base_sampler
        :return : Returns a list containing the predicted value function for each state in the path
        """
        if self._coeffs is None:
            return np.zeros(len(path["rewards"]))
        return self._features(path).dot(self._coeffs)


if __name__ == "__main__":
    # Manual smoke test: sample two short random-policy rollouts on Pendulum,
    # print the baseline features of the first path, compute returns for both
    # rollouts and fit the linear baseline on the first one.
    N = 1
    pol = RandomPolicy((-2, 2))
    T = 5
    env_name = 'Pendulum-v0'
    # Pass T rather than repeating the literal so the setting above is the
    # single source of the trajectory length.
    y = base_sampler.do_rollout(N=N, policy=pol, T=T, env_name=env_name)
    y2 = base_sampler.do_rollout(N=N, policy=pol, T=T, env_name=env_name)

    env = GymEnv(env_name)
    base_line = LinearBaseline(env.spec)
    features = base_line._features(y[0])
    print(features)
    compute_returns(y, 1)
    compute_returns(y2, 1)
    # NOTE(review): the second argument presumably asks fit() to return its
    # errors - confirm against LinearBaseline.fit.
    errors = base_line.fit(y, True)