Example #1
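All of the examples below share the same setup. A minimal sketch of the assumed imports follows; the commented helper imports (taxi, roll_out, run_wrapper, run_seeds_in_parallel) are repo-local, and their module paths here are assumptions, not confirmed by the source:

import os
import multiprocessing
from functools import partial

import numpy as np

# Repo-local helpers used by every example; hypothetical import paths:
# from infinite_horizon_off_policy_estimation.taxi.environment import taxi
# from infinite_horizon_off_policy_estimation.taxi.experiments import (
#     roll_out, run_wrapper, run_seeds_in_parallel)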
def varying_target_mixture(estimator_names, alpha_list):
    """
    Run multiple experiments that vary the alpha values used to mix policies
    """
    # environment
    length = 5
    env = taxi(length)
    n_state = env.n_state
    n_action = env.n_action

    # Policies
    pi_target = np.load(os.getcwd() + '/infinite_horizon_off_policy_estimation/taxi/taxi-policy/pi19.npy')
    pi_behavior = np.load(os.getcwd() + '/infinite_horizon_off_policy_estimation/taxi/taxi-policy/pi18.npy')

    # Sampling vars
    ts = 400      # truncate_size
    nt = 1000     # number of trajectories
    gm = 0.99     # gamma (discount factor)
    nb_seeds = 12

    results = np.zeros((len(alpha_list), len(estimator_names), nb_seeds))
    for idx, alpha in enumerate(alpha_list):
        # Mix into a fresh array; overwriting pi_behavior here would compound
        # the mixture across iterations of the alpha loop.
        pi_mix = alpha * pi_target + (1 - alpha) * pi_behavior
        lam_fct = partial(run_wrapper, n_state, n_action, env, roll_out,
                          estimator_names, pi_mix, pi_target, nt, ts, gm)
        ret = run_seeds_in_parallel(int(multiprocessing.cpu_count() / 2),
                                    lam_fct, estimator_names, nb_seeds)
        results[idx, :, :] = ret

    return results
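A hypothetical invocation of the function above; the estimator names mirror the list in Example #5 and the alpha grid is illustrative:

estimator_names = ['On Policy', 'Density Ratio', 'IST', 'WIST']
alpha_list = [0.0, 0.25, 0.5, 0.75]
results = varying_target_mixture(estimator_names, alpha_list)
# results has shape (len(alpha_list), len(estimator_names), nb_seeds)
print(results.mean(axis=-1))  # per-(alpha, estimator) average over seeds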
Example #2
def varying_trajectories_and_length(estimator_names, nt_list, ts_list):
    """
    Run multiple experiments that vary both the number of trajectories
    and the truncation length.
    """
    # environment
    length = 5
    env = taxi(length)
    n_state = env.n_state
    n_action = env.n_action

    # Policies
    pi_target = np.load(os.getcwd() + '/infinite_horizon_off_policy_estimation/taxi/taxi-policy/pi19.npy')
    pi_behavior = np.load(os.getcwd() + '/infinite_horizon_off_policy_estimation/taxi/taxi-policy/pi18.npy')

    # Sampling vars
    alpha = 0.0   # mixture ratio
    gm = 0.995    # gamma (discount factor)
    nb_seeds = 12

    results = np.zeros((len(nt_list), len(ts_list), len(estimator_names), nb_seeds))
    # With alpha = 0.0 the mixture leaves pi_behavior unchanged; apply it once
    # here instead of re-applying it on every loop iteration.
    pi_behavior = alpha * pi_target + (1 - alpha) * pi_behavior
    for i, nt in enumerate(nt_list):
        for j, ts in enumerate(ts_list):
            lam_fct = partial(run_wrapper, n_state, n_action, env, roll_out,
                              estimator_names, pi_behavior, pi_target, nt, ts, gm)
            ret = run_seeds_in_parallel(int(multiprocessing.cpu_count() / 2),
                                        lam_fct, estimator_names, nb_seeds)
            results[i, j, :, :] = ret

    return results
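A sketch of how the 4-D result array might be reduced for reporting; the list values are illustrative:

nt_list = [200, 500, 1000]
ts_list = [100, 200, 400]
results = varying_trajectories_and_length(['IST', 'WIST'], nt_list, ts_list)
mean = results.mean(axis=-1)  # shape: (len(nt_list), len(ts_list), n_estimators)
std = results.std(axis=-1)
print(mean[0, 0], std[0, 0])  # estimates for nt=200, ts=100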
Example #3
def varying_number_trajectories(estimator_names, nt_list=[200, 500, 1000, 2000]):
    """
    Run multiple experiments that vary the number of trajectories
    """
    # environment
    length = 5
    env = taxi(length)
    n_state = env.n_state
    n_action = env.n_action

    # Policies
    alpha = 0.0 # mixture ratio
    pi_target = np.load(os.getcwd() + '/infinite_horizon_off_policy_estimation/taxi/taxi-policy/pi19.npy')
    pi_behavior = np.load(os.getcwd() + '/infinite_horizon_off_policy_estimation/taxi/taxi-policy/pi18.npy')
    pi_behavior = alpha * pi_target + (1-alpha) * pi_behavior

    # Sampling vars
    ts = 400 # truncate_size
    gm = 0.99 # gamma
    nb_seeds = 12

    results = np.zeros( (len(nt_list), len(estimator_names), nb_seeds) )
    for idx, nt in enumerate(nt_list):
        lam_fct = partial(run_wrapper, n_state, n_action, env, roll_out, estimator_names, pi_behavior, 
                          pi_target, nt, ts, gm)
        ret = run_seeds_in_parallel(int(multiprocessing.cpu_count() / 2), lam_fct, estimator_names, nb_seeds)
        results[idx, :, :] = ret

    return results
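Since nt_list has a default, a call with estimator names alone sweeps 200 to 2000 trajectories (a hypothetical usage):

results = varying_number_trajectories(['IST', 'WIST', 'Model Based'])
for nt, row in zip([200, 500, 1000, 2000], results.mean(axis=-1)):
    print(nt, row)  # one mean estimate per estimator at this nt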
Example #4
def varying_trajectories_and_alpha_distant_behavior_policy(estimator_names, nt_list, alpha_list):
    """
    Same as varying_trajectories_and_alpha, but we choose a behavior policy that is 
    much different from the target one.
    """
    # environment
    length = 5
    env = taxi(length)
    n_state = env.n_state
    n_action = env.n_action

    # Policies
    pi_target = np.load(os.getcwd() + '/infinite_horizon_off_policy_estimation/taxi/taxi-policy/pi19.npy')
    pi_behavior = np.load(os.getcwd() + '/infinite_horizon_off_policy_estimation/taxi/taxi-policy/pi13.npy')

    # Sampling vars
    ts = 400      # truncate_size
    gm = 0.995    # gamma (discount factor)
    nb_seeds = 12

    results = np.zeros((len(alpha_list), len(nt_list), len(estimator_names), nb_seeds))
    for i, alpha in enumerate(alpha_list):
        # Mix into a fresh array so each alpha mixes against the original
        # behavior policy instead of the previous iteration's mixture.
        pi_mix = alpha * pi_target + (1 - alpha) * pi_behavior
        for j, nt in enumerate(nt_list):
            lam_fct = partial(run_wrapper, n_state, n_action, env, roll_out,
                              estimator_names, pi_mix, pi_target, nt, ts, gm)
            ret = run_seeds_in_parallel(int(multiprocessing.cpu_count() / 2),
                                        lam_fct, estimator_names, nb_seeds)
            results[i, j, :, :] = ret

    return results
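A hypothetical call; note that the axis order of the returned array is (alpha, nt, estimator, seed):

distant = varying_trajectories_and_alpha_distant_behavior_policy(
    ['IST', 'WIST'], nt_list=[500, 1000], alpha_list=[0.0, 0.5])
print(distant.mean(axis=-1))  # average over seeds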
Example #5
        sys.stdout.flush()

    #executor.terminate()
    return res


if __name__ == "__main__":
    # Test whether the experiments run in parallel.
    estimator_names = [
        'On Policy', 'Density Ratio', 'Naive Average', 'IST', 'ISS', 'WIST',
        'WISS', 'Model Based', 'DualDICE'
    ]
    length = 5
    env = taxi(length)
    n_state = env.n_state
    n_action = env.n_action

    num_trajectory = 50
    truncate_size = 50
    gamma = 0.995

    parser = argparse.ArgumentParser(description='taxi environment')
    parser.add_argument('--nt',
                        type=int,
                        required=False,
                        default=num_trajectory)
    parser.add_argument('--ts',
                        type=int,
                        required=False,