def main():
    """Run the sequential Q-OCBA experiment and a baseline on a small MDP.

    The MDP has 5 states and 2 actions. The flat transition vector ``p`` is
    indexed as ``state * n_a * n_s + action * n_s + next_state`` and the flat
    reward vector ``r`` has ``n_s * n_a`` entries; only ``r[0]`` (from
    ``--r0``) and ``r[-1]`` (10.0) are non-zero.

    For each of ``--rep`` replications the function:
      1. collects a short warm-start sample and fits ``parameter_prior``;
      2. repeatedly solves an SLSQP problem over ``x = [z, x_1, ..., x_{n_s*n_a}]``
         and passes ``x[1:]`` to ``collect_data`` as ``pi_s_a`` for the next
         batch of ``--epi_step_num`` samples;
      3. evaluates the final estimated Q against the true MDP.

    Afterwards a second loop ("follow original") collects all ``--numdata``
    samples with the warm-start exploration rule only. Results (PCS, value of
    the resulting policy, running time) are printed; nothing is returned.
    """

    def _str2bool(text):
        """Parse a command-line boolean.

        ``type=bool`` would map every non-empty string, including "False",
        to True, so the flag could never be switched off explicitly.
        """
        lowered = text.strip().lower()
        if lowered in ("1", "true", "t", "yes", "y"):
            return True
        if lowered in ("0", "false", "f", "no", "n", ""):
            return False
        raise argparse.ArgumentTypeError(
            "expected a boolean value, got {!r}".format(text))

    parser = argparse.ArgumentParser()
    parser.add_argument('--rep', nargs="?", type=int, default=100, help='number of repetitions')
    parser.add_argument('--r0', nargs="?", type=float, default=1.0, help='value of r0')
    parser.add_argument('--r_prior', nargs="?", type=float, default=0.0, help='prior value of reward function')
    parser.add_argument('--optLb', nargs="?", type=float, default=1e-2,
                        help='lower bound of each optimization variable x[1:]')
    parser.add_argument('--numdata', nargs="?", type=int, default=1000, help='number of data')
    parser.add_argument('--epi_step_num', nargs="?", type=int, default=100, help='number of episode steps')
    parser.add_argument('--rightprop', nargs="?", type=float, default=0.6,
                        help='warm start random exploration right probability')
    parser.add_argument('--rstd', nargs="?", type=float, default=1.0,
                        help='standard deviation of reward')
    parser.add_argument('--opt_ori', nargs="?", type=_str2bool, default=False,
                        help='Q-OCBA optimization method')
    parser.add_argument('--num_value_iter', nargs="?", type=int, default=200, help='number of value iteration')
    parser.add_argument('--opt_one_step', nargs="?", type=_str2bool, default=False,
                        help='Q-OCBA optimization running only one step')

    args = parser.parse_args()
    opt_ori = args.opt_ori
    print("Q-OCBA optimization method using original formulation is {}".format(opt_ori))
    num_rep = args.rep
    right_prop = args.rightprop
    optLb = args.optLb
    s_0 = 2
    # collect data configuration
    n_s = 5
    print("n_s is {}".format(n_s))
    n_a = 2
    # value-iteration configuration
    num_iter = 200
    gamma = 0.95
    # real p and r
    p = np.zeros(n_s * n_a * n_s)
    Q_0 = np.zeros(n_s * n_a)
    V_0 = np.zeros(n_s)
    rou = np.ones(n_s) / n_s  # uniform weights used to average V over states
    r = np.zeros(n_s * n_a)
    r[0] = args.r0
    r[-1] = 10.
    r_sd = args.rstd
    r_prior_mean = args.r_prior
    print("reward standard deviation is {}".format(r_sd))
    # r[0] = 10.
    # r[-1] = 0.1
    print("r[0] and r[-1] are {}, {}".format(r[0], r[-1]))
    # First state: action 0 stays put, action 1 stays w.p. 0.7 / moves up w.p. 0.3.
    p[0 * n_s * n_a + 0 * n_s + 0] = 1.
    p[0 * n_s * n_a + 1 * n_s + 0] = 0.7
    p[0 * n_s * n_a + 1 * n_s + 1] = 0.3
    # Interior states: action 0 moves down; action 1 moves down/stays/up
    # with probabilities 0.1/0.6/0.3.
    for i in range(1, (n_s - 1)):
        p[i * n_a * n_s + 0 * n_s + (i - 1)] = 1
        p[i * n_a * n_s + 1 * n_s + (i - 1)] = 0.1
        p[i * n_a * n_s + 1 * n_s + i] = 0.6
        p[i * n_a * n_s + 1 * n_s + (i + 1)] = 0.3
    # Last state: action 0 moves down, action 1 moves down w.p. 0.7 / stays w.p. 0.3.
    p[(n_s - 1) * n_s * n_a + 0 * n_s + (n_s - 2)] = 1
    p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 2)] = 0.7
    p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 1)] = 0.3
    Q_real = Iterative_Cal_Q.cal_Q_val(p, Q_0, r, num_iter, gamma, n_s, n_a)
    V_real, V_max_index = inference.get_V_from_Q(Q_real, n_s, n_a)

    Total_data = args.numdata
    print("total num of data is {}".format(Total_data))
    episode_steps = args.epi_step_num
    numdata_1 = 5
    print("warm start steps is {}".format(numdata_1))
    numdata_2 = Total_data
    print("epsisode timestep is {}".format(episode_steps))
    # Floor division: a list can only be repeated an integer number of times
    # ("/" yields a float on Python 3 and raises TypeError here).
    num_datas = [episode_steps] * (numdata_2 // episode_steps)
    #num_datas = [1000, 0]
    CS_num = 0.
    future_V = np.zeros(num_rep)
    Total_time = []
    #if use Bayesian prior as exploration
    Bayes_resample = False
    #optLbs = np.linspace(optLb, 1e-6, len(num_datas))

    # The objective and the box bounds do not depend on the data, so they are
    # built once. x[0] is the auxiliary variable, x[1:] has one entry per
    # (state, action) pair.
    if opt_ori:
        def fun(x):
            return -x[0]
    else:
        def fun(x):
            return x[0]
    bnds = tuple([(0., None)] + [(optLb, 1)] * (n_s * n_a))
    I = np.identity(n_s * n_a)

    for ii in range(num_rep):
        time_rep = time.time()
        para_cl = parameter_prior(n_s, n_a, s_0, r_mean_prior=r_prior_mean)
        data = collect_data_swimmer.collect_data(p, r, numdata_1, s_0, n_s, n_a, right_prop=right_prop, std=r_sd)
        para_cl.update(data, resample=Bayes_resample)
        p_n, r_n, r_std = para_cl.get_para(resample=Bayes_resample)
        var_r_n = r_std ** 2

        Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, args.num_value_iter, gamma, n_s, n_a)
        V_n, V_n_max_index = inference.get_V_from_Q(Q_n, n_s, n_a)
        for jj, num_data in enumerate(num_datas):
            TM = inference.embedd_MC(p_n, n_s, n_a, V_n_max_index)
            I_TM = np.linalg.inv(I - gamma * TM)
            V = np.diag(var_r_n)
            ds = []
            for i in range(n_s):
                for j in range(n_a):
                    # Slice holding the next-state distribution of pair (i, j).
                    p_sa = p_n[(i * n_a * n_s + j * n_s): (i * n_a * n_s + (j + 1) * n_s)]
                    ds.append(inference.cal_cov_p_quad_V(p_sa, V_n, n_s))
            D = np.diag(ds)
            cov_V_D = V + D
            quad_consts = np.zeros((n_s, n_a))
            denom_consts = np.zeros((n_s, n_a, n_s * n_a))

            for i in range(n_s):
                for j in range(n_a):
                    if j != V_n_max_index[i]:
                        # Contrast vector: +1 on (i, j), -1 on (i, greedy action).
                        minus_op = np.zeros(n_s * n_a)
                        minus_op[i * n_a + j] = 1
                        minus_op[i * n_a + V_n_max_index[i]] = -1
                        denom_consts[i][j] = np.power(np.dot(minus_op, I_TM), 2) * np.diag(cov_V_D)
                        quad_consts[i][j] = (Q_n[i * n_a + j] - Q_n[i * n_a + V_n_max_index[i]]) ** 2

            A, b, _, _ = two_stage_inference.construct_contrain_matrix(p_n, n_s, n_a)
            AA = np.array(A)

            # One inequality per non-greedy (state, action) pair. The per-pair
            # constants are handed over through 'args', so the lambdas do not
            # capture the loop variables.
            constraints = []
            if opt_ori:
                for i in range(n_s):
                    for j in range(n_a):
                        if j != V_n_max_index[i]:
                            if np.max(denom_consts[i][j]) > 1e-5:
                                constraints.append({'type': 'ineq', 'fun': lambda x, up_c, denom_c: up_c / (
                                    np.sum(np.multiply(denom_c, np.reciprocal(x[1:])))) - x[0],
                                                    'args': (quad_consts[i][j], denom_consts[i][j])})
            else:
                for i in range(n_s):
                    for j in range(n_a):
                        if j != V_n_max_index[i]:
                            if np.max(quad_consts[i][j]) > 1e-5:
                                constraints.append({'type': 'ineq', 'fun': lambda x, up_c, denom_c: -(
                                    np.sum(np.multiply(denom_c, np.reciprocal(x[1:])))) / up_c + x[0],
                                                    'args': (quad_consts[i][j], denom_consts[i][j])})

            # Linear equality constraints AA[k] . x[1:] == b[k].
            for i in range(AA.shape[0]):
                constraints.append(
                    {'type': 'eq', 'fun': lambda x, a, b: np.dot(a, x[1:]) - b, 'args': (AA[i], b[i])})
            constraints = tuple(constraints)

            initial = np.ones(n_s * n_a + 1) / (n_s * n_a)
            initial[0] = 0.1
            if args.opt_one_step:
                res = minimize(fun, initial, method='SLSQP', bounds=bnds,
                               constraints=constraints, options={'disp': False, 'maxiter': 1})
            else:
                res = minimize(fun, initial, method='SLSQP', bounds=bnds,
                               constraints=constraints)
            x_opt = res.x[1:]

            # Collect the next batch with x_opt as pi_s_a, continuing from the
            # state tracked by para_cl, then refresh every estimate.
            data = collect_data_swimmer.collect_data(p, r, num_data, para_cl.s, n_s, n_a, pi_s_a=x_opt, std=r_sd)
            para_cl.update(data, resample=Bayes_resample)
            # freq is only consumed by diagnostics that are currently disabled.
            _, _, freq, _ = cal_impirical_r_p.cal_impirical_stats(data, n_s, n_a)

            p_n, r_n, r_std = para_cl.get_para(resample=Bayes_resample)
            var_r_n = r_std ** 2
            Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, args.num_value_iter, gamma, n_s, n_a)
            V_n, V_n_max_index = inference.get_V_from_Q(Q_n, n_s, n_a)
        # The timer stops before the policy evaluation below.
        Total_time.append(time.time() - time_rep)
        V_here = policy_val_iteration(Q_n, n_s, n_a, V_0, num_iter, r, p, gamma)
        future_V[ii] = np.dot(rou, V_here)
        fS_bool = optimize_pfs.FS_bool(Q_n, V_max_index, n_s, n_a)
        CS_num += fS_bool

    # Summary statistics over all replications.
    fv = np.mean(future_V)
    fv_std = np.std(future_V)
    rv = np.dot(rou, V_real)
    diff = rv - fv
    # float() replaces np.float, which was removed in NumPy 1.24.
    PCS = float(CS_num) / num_rep
    CI_len = 1.96 * np.sqrt(PCS * (1 - PCS) / num_rep)
    print("Seq_Q_OCBA")
    print("PCS is {}, with CI length {}".format(PCS, CI_len))
    print("future value func is {} with CI length {}, real value is {}, diff is {}".format(fv, 1.96 * fv_std / np.sqrt(
        num_rep), rv, diff))
    runnung_time_mean = np.mean(Total_time)
    runnung_time_CI = 1.96 * np.std(Total_time) / np.sqrt(num_rep)
    print("average running time of Seq QOCBA is {} with CI length {}".format(runnung_time_mean, runnung_time_CI))

    # follow original
    CS_num_naive = 0
    future_V = np.zeros(num_rep)
    for i in range(num_rep):
        # NOTE(review): std is not forwarded here, unlike the warm start above,
        # so collect_data uses its own default - confirm this is intended.
        data = collect_data_swimmer.collect_data(p, r, Total_data, s_0, n_s, n_a, right_prop=right_prop)
        p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats(data, n_s, n_a)
        Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma, n_s, n_a)
        V_here = policy_val_iteration(Q_n, n_s, n_a, V_0, num_iter, r, p, gamma)
        future_V[i] = np.dot(rou, V_here)
        fS_bool_ = optimize_pfs.FS_bool(Q_n, V_max_index, n_s, n_a)
        CS_num_naive += fS_bool_
    PCS_naive = float(CS_num_naive) / num_rep
    CI_len = 1.96 * np.sqrt(PCS_naive * (1 - PCS_naive) / num_rep)
    fv = np.mean(future_V)
    fv_std = np.std(future_V)
    rv = np.dot(rou, V_real)
    diff = rv - fv
    print("follow original")
    print("PCS is {}, with CI length {}".format(PCS_naive, CI_len))
    print("future value func is {} with CI length {}, real value is {}, diff is {}".format(fv, 1.96 * fv_std / np.sqrt(
        num_rep), rv, diff))
# Example #2
# 0
def main():
    """Run the UCRL experiment on a 5-state, 2-action MDP for r[0] in {1, 2, 3}.

    The flat transition vector ``p`` is indexed as
    ``state * n_a * n_s + action * n_s + next_state``; the flat reward vector
    ``r`` has ``n_s * n_a`` entries with ``r[-1]`` fixed to 10.0.

    For each reward setting and each of ``--rep`` replications the function:
      1. collects a first-stage sample of 30% of ``--numdata`` with the
         warm-start exploration rule, retrying until ``f_n`` has no zero entry;
      2. runs ``UCRL`` until its step counter ``t`` reaches ``--numdata``;
      3. scores the resulting Q estimate against the true MDP and builds
         confidence intervals from all collected data via ``inference.get_CI``.

    PCS, policy value and CI coverage/length statistics are printed; nothing
    is returned.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--rep', nargs="?", type=int, default=100, help='number of repetitions')
    #parser.add_argument('--r0', nargs="?", type=float, default=1.0, help='value of r0')
    parser.add_argument('--numdata', nargs="?", type=int, default=1000, help='number of data')
    parser.add_argument('--rightprop', nargs="?", type=float, default=0.6,
                        help='warm start random exploration right probability')
    parser.add_argument('--rstd', nargs="?", type=float, default=1.0,
                        help='standard deviation of reward ')

    args = parser.parse_args()
    num_iter, gamma, n_s, n_a, delta, num_rep = 200, 0.95, 5, 2, 0.05, args.rep
    right_prop = args.rightprop
    Q_0 = np.zeros(n_s * n_a)
    V_0 = np.zeros(n_s)
    p = np.zeros(n_s * n_a * n_s)
    r = np.zeros(n_s * n_a)
    for r0_val in range(1, 4):
        r[0] = float(r0_val)
        r[-1] = 10.
        r_std = args.rstd
        # r[0] = 10.
        # r[-1] = 0.1
        print("r[0] and r[-1] are {}, {}".format(r[0], r[-1]))
        # First state: action 0 stays put, action 1 stays w.p. 0.7 / moves up w.p. 0.3.
        p[0 * n_s * n_a + 0 * n_s + 0] = 1.
        p[0 * n_s * n_a + 1 * n_s + 0] = 0.7
        p[0 * n_s * n_a + 1 * n_s + 1] = 0.3
        # Interior states: action 0 moves down; action 1 moves down/stays/up
        # with probabilities 0.1/0.6/0.3.
        for i in range(1, (n_s - 1)):
            p[i * n_a * n_s + 0 * n_s + (i - 1)] = 1
            p[i * n_a * n_s + 1 * n_s + (i - 1)] = 0.1
            p[i * n_a * n_s + 1 * n_s + i] = 0.6
            p[i * n_a * n_s + 1 * n_s + (i + 1)] = 0.3
        # Last state: action 0 moves down, action 1 moves down w.p. 0.7 / stays w.p. 0.3.
        p[(n_s - 1) * n_s * n_a + 0 * n_s + (n_s - 2)] = 1
        p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 2)] = 0.7
        p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 1)] = 0.3
        Q_real = Iterative_Cal_Q.cal_Q_val(p, Q_0, r, num_iter, gamma, n_s, n_a)
        V_real, V_max_index = inference.get_V_from_Q(Q_real, n_s, n_a)
        print("Q real is {}".format(Q_real))
        s_0 = 2
        rou = np.ones(n_s) / n_s  # uniform weights used to average V over states

        Q_approximation = None
        initial_s_dist = "even"
        if initial_s_dist == "even":
            R_real = np.mean(V_real)
            initial_w = np.ones(n_s) / n_s
        # Per-entry counters of how often the CI covered the true value.
        cov_bools_Q = np.zeros(n_s * n_a)
        cov_bools_V = np.zeros(n_s)
        cov_bools_R = 0.
        CI_lens_Q = []
        CI_lens_V = []
        CI_lens_R = []
        numerical_tol = 1e-6
        S_0 = None

        ## UCRL
        CS_num = 0.
        num_data = args.numdata
        # Floor division keeps the first-stage sample size an integer
        # ("/" yields a float on Python 3).
        num_1 = num_data * 3 // 10
        future_V = np.zeros(num_rep)
        for i in range(num_rep):
            # Redraw the first-stage sample until f_n contains no zero entry.
            while True:
                data1 = collect_data_swimmer.collect_data(p, r, num_1, s_0, n_s, n_a, right_prop=right_prop, std=r_std)
                p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats(data1, n_s, n_a)
                Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma, n_s, n_a)
                V_n, V_n_max_index = inference.get_V_from_Q(Q_n, n_s, n_a)
                if f_n.all():
                    break
            pre_collected_stats = get_pre_collected_stats(data1, n_s, n_a)
            # delta == 0.05, the literal that was passed here before.
            UCRL_cl = UCRL(n_s, n_a, delta, num_1, s_0, num_data, pre_collected_stats)
            while UCRL_cl.t < num_data:
                UCRL_cl.update_point_estimate_and_CIbound()
                UCRL_cl.Extended_Value_Iter()
                UCRL_cl.collect_data_and_update(p, r, r_std=r_std)
            UCRL_cl.update_point_estimate_and_CIbound()
            Q_estimate = Iterative_Cal_Q.cal_Q_val(UCRL_cl.transition, Q_0, UCRL_cl.rew, num_iter, gamma, n_s, n_a)
            FS_bool = optimize_pfs.FS_bool(Q_estimate, V_max_index, n_s, n_a)
            CS_num += FS_bool
            V_here = optimize_pfs.policy_val_iteration(Q_estimate, n_s, n_a, V_0, num_iter, r, p, gamma)
            future_V[i] = np.dot(rou, V_here)
            # Confidence intervals are built from first-stage plus UCRL data.
            datahere = data1 + UCRL_cl.datas
            Q_n, CI_len_Q, V_n, CI_len_V, R_n, CI_len_R = inference.get_CI(Q_approximation, S_0, num_data, s_0,
                                                                           num_iter, gamma,
                                                                           Q_0, n_s, n_a, r, p, initial_w, right_prop,
                                                                           data=datahere)
            # A true value counts as covered when it lies inside the interval
            # widened by numerical_tol on both sides.
            cov_bool_Q = np.logical_and(Q_real <= (Q_n + CI_len_Q + numerical_tol),
                                        Q_real >= (Q_n - CI_len_Q - numerical_tol))
            cov_bool_V = np.logical_and(V_real <= (V_n + CI_len_V + numerical_tol),
                                        V_real >= (V_n - CI_len_V - numerical_tol))
            cov_bool_R = np.logical_and(R_real <= (R_n + CI_len_R + numerical_tol),
                                        R_real >= (R_n - CI_len_R - numerical_tol))
            cov_bools_Q += cov_bool_Q
            cov_bools_V += cov_bool_V
            cov_bools_R += cov_bool_R
            CI_lens_Q.append(CI_len_Q)
            CI_lens_V.append(CI_len_V)
            CI_lens_R.append(CI_len_R)
        # float() replaces np.float, which was removed in NumPy 1.24.
        PCS = float(CS_num) / num_rep
        CI_len = 1.96 * np.sqrt(PCS * (1 - PCS) / num_rep)
        fv = np.mean(future_V)
        fv_std = np.std(future_V)
        rv = np.dot(rou, V_real)
        diff = rv - fv
        print("PCS is {}, with CI length {}".format(PCS, CI_len))
        print("future value func is {} with  CI length {}, real value is {}, diff is {}".format(fv, 1.96 * fv_std / np.sqrt(
            num_rep), rv, diff))

        CI_len_Q_mean = np.mean(CI_lens_Q)
        CI_len_V_mean = np.mean(CI_lens_V)
        CI_len_R_mean = np.mean(CI_lens_R)
        CI_len_Q_ci = 1.96 * np.std(CI_lens_Q) / np.sqrt(num_rep)
        CI_len_V_ci = 1.96 * np.std(CI_lens_V) / np.sqrt(num_rep)
        CI_len_R_ci = 1.96 * np.std(CI_lens_R) / np.sqrt(num_rep)

        cov_rate_Q = np.divide(cov_bools_Q, num_rep)
        cov_rate_V = np.divide(cov_bools_V, num_rep)
        cov_rate_R = np.divide(cov_bools_R, num_rep)
        cov_rate_CI_Q = 1.96 * np.sqrt(cov_rate_Q * (1 - cov_rate_Q) / num_rep)
        cov_rate_CI_V = 1.96 * np.sqrt(cov_rate_V * (1 - cov_rate_V) / num_rep)
        cov_rate_CI_R = 1.96 * np.sqrt(cov_rate_R * (1 - cov_rate_R) / num_rep)
        print("coverage for Q")
        print(cov_rate_Q)
        print(cov_rate_CI_Q)
        print("mean coverage for Q ")
        print(np.mean(cov_rate_Q))
        print(np.mean(cov_rate_CI_Q))
        print("coverage for V")
        print(cov_rate_V)
        print(cov_rate_CI_V)
        print("mean coverage for V")
        print(np.mean(cov_rate_V))
        print(np.mean(cov_rate_CI_V))
        print("coverage for R")
        print(cov_rate_R)
        print(cov_rate_CI_R)
        print("CI len for Q CI {} with ci {}".format(CI_len_Q_mean, CI_len_Q_ci))
        print("CI len for V CI {} with ci {}".format(CI_len_V_mean, CI_len_V_ci))
        print("CI len for R CI {} with ci {}".format(CI_len_R_mean, CI_len_R_ci))
# Example #3
# 0
def main():
    """Run the PSPE experiment on a 5-state, 2-action MDP.

    The flat transition vector ``p`` is indexed as
    ``state * n_a * n_s + action * n_s + next_state``; the flat reward vector
    ``r`` has ``n_s * n_a`` entries, of which only ``r[0]`` (``--r0``) and
    ``r[-1]`` (10.0) are set.

    Data budget:
      * two-stage (default): 30% of ``--numdata`` for the first stage, the
        rest split evenly over 100 batches;
      * sequential (``--two_stage False``): one batch of ``--epi_step_num``
        samples first, then as many full batches of that size as fit.

    In each of ``--rep`` replications, after every batch a Q function is drawn
    from ``parameter_prior``. With probability ``--beta`` that draw is kept;
    otherwise further draws are made until one has a different greedy policy.
    PCS and the value of the final policy are printed; nothing is returned.
    """

    def _str2bool(text):
        """Parse a command-line boolean.

        ``type=bool`` would map every non-empty string, including "False",
        to True, which made it impossible to disable --two_stage.
        """
        lowered = text.strip().lower()
        if lowered in ("1", "true", "t", "yes", "y"):
            return True
        if lowered in ("0", "false", "f", "no", "n", ""):
            return False
        raise argparse.ArgumentTypeError(
            "expected a boolean value, got {!r}".format(text))

    parser = argparse.ArgumentParser()
    parser.add_argument('--rep',
                        nargs="?",
                        type=int,
                        default=100,
                        help='number of repetitions')
    parser.add_argument('--r0',
                        nargs="?",
                        type=float,
                        default=0.0,
                        help='value of r0')
    #parser.add_argument('--r_prior', nargs="?", type=float, default=1.0, help='prior value of reward function')
    parser.add_argument('--numdata',
                        nargs="?",
                        type=int,
                        default=1000,
                        help='number of data')
    parser.add_argument('--epi_step_num',
                        nargs="?",
                        type=int,
                        default=100,
                        help='number of episode steps')
    parser.add_argument('--rightprop',
                        nargs="?",
                        type=float,
                        default=0.6,
                        help='warm start random exploration right probability')
    parser.add_argument('--rstd',
                        nargs="?",
                        type=float,
                        default=1.0,
                        help='standard deviation of reward')
    parser.add_argument('--beta',
                        nargs="?",
                        type=float,
                        default=0.25,
                        help='beta')
    parser.add_argument('--two_stage',
                        nargs="?",
                        type=_str2bool,
                        default=True,
                        help='if run two stage or sequential experiment')
    args = parser.parse_args()
    print("PSPE")
    num_iter, gamma, n_s, n_a, num_rep = 200, 0.95, 5, 2, args.rep
    right_prop = args.rightprop

    Q_0 = np.zeros(n_s * n_a)
    V_0 = np.zeros(n_s)
    p = np.zeros(n_s * n_a * n_s)
    r = np.zeros(n_s * n_a)
    r[0] = args.r0
    r[-1] = 10.
    r_std = args.rstd
    print("reward standard deviation is {}".format(r_std))
    # r[0] = 10.
    # r[-1] = 0.1
    print("r[0] and r[-1] are {}, {}".format(r[0], r[-1]))
    # First state: action 0 stays put, action 1 stays w.p. 0.7 / moves up w.p. 0.3.
    p[0 * n_s * n_a + 0 * n_s + 0] = 1.
    p[0 * n_s * n_a + 1 * n_s + 0] = 0.7
    p[0 * n_s * n_a + 1 * n_s + 1] = 0.3
    # Interior states: action 0 moves down; action 1 moves down/stays/up
    # with probabilities 0.1/0.6/0.3.
    for i in range(1, (n_s - 1)):
        p[i * n_a * n_s + 0 * n_s + (i - 1)] = 1
        p[i * n_a * n_s + 1 * n_s + (i - 1)] = 0.1
        p[i * n_a * n_s + 1 * n_s + i] = 0.6
        p[i * n_a * n_s + 1 * n_s + (i + 1)] = 0.3
    # Last state: action 0 moves down, action 1 moves down w.p. 0.7 / stays w.p. 0.3.
    p[(n_s - 1) * n_s * n_a + 0 * n_s + (n_s - 2)] = 1
    p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 2)] = 0.7
    p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 1)] = 0.3
    Q_real = Iterative_Cal_Q.cal_Q_val(p, Q_0, r, num_iter, gamma, n_s, n_a)
    V_real, V_max_index = inference.get_V_from_Q(Q_real, n_s, n_a)
    s_0 = 2

    ## PSPE
    # All sizes below use floor division so they stay integers on Python 3;
    # they are used as list repeat counts and as sample counts.
    if not args.two_stage:
        print("sequential implementation")
        Total_data = args.numdata
        print("total num of data is {}".format(Total_data))
        episode_steps = args.epi_step_num
        numdata_1 = episode_steps
        numdata_2 = Total_data - numdata_1
        print("epsisode timestep is {}".format(episode_steps))
        num_datas = [episode_steps] * (numdata_2 // episode_steps)
    else:
        print("two_stage implementation")
        Total_data = args.numdata
        print("total num of data is {}".format(Total_data))
        numdata_1 = Total_data * 3 // 10
        numdata_2 = Total_data - numdata_1
        episodes = 100
        num_datas = [numdata_2 // episodes] * episodes

    CS_num = 0.
    beta = args.beta
    rou = np.ones(n_s) / n_s  # uniform weights used to average V over states
    future_V = np.zeros(num_rep)
    for i in range(num_rep):
        para_cl = parameter_prior(n_s, n_a, s_0)
        # Redraw the first-stage sample until f_n contains no zero entry.
        while True:
            data1 = collect_data_swimmer.collect_data(p,
                                                      r,
                                                      numdata_1,
                                                      s_0,
                                                      n_s,
                                                      n_a,
                                                      right_prop=right_prop,
                                                      std=r_std)
            p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats(
                data1, n_s, n_a)
            Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma,
                                            n_s, n_a)
            V_n, V_n_max_index = inference.get_V_from_Q(Q_n, n_s, n_a)
            if f_n.all():
                break
        para_cl.update(data1, r_sigma=r_std)
        Q_estimate = para_cl.sampled_MDP_Q(Q_0, num_iter, gamma)

        for num_data in num_datas:
            # Act on the current Q draw (epsilon=0), starting from the state
            # tracked by para_cl, then update para_cl with the new batch.
            data = collect_data_swimmer.collect_data(p,
                                                     r,
                                                     num_data,
                                                     para_cl.s_0,
                                                     n_s,
                                                     n_a,
                                                     Q=Q_estimate,
                                                     epsilon=0,
                                                     std=r_std)
            para_cl.update(data, r_sigma=r_std)
            Q_estimate_1 = para_cl.sampled_MDP_Q(Q_0, num_iter, gamma)
            _, V_n_max_index_1 = inference.get_V_from_Q(
                Q_estimate_1, n_s, n_a)
            sim = np.random.binomial(1, beta, 1)[0]
            if sim:
                Q_estimate = Q_estimate_1
            else:
                # Keep drawing until the greedy policy differs from the first
                # draw. np.array_equal gives a single bool for lists and for
                # arrays, whereas "!=" on arrays is ambiguous inside "if".
                while True:
                    Q_estimate_2 = para_cl.sampled_MDP_Q(Q_0, num_iter, gamma)
                    _, V_n_max_index_2 = inference.get_V_from_Q(
                        Q_estimate_2, n_s, n_a)
                    if not np.array_equal(V_n_max_index_2, V_n_max_index_1):
                        break
                Q_estimate = Q_estimate_2
        V_here = optimize_pfs.policy_val_iteration(Q_estimate, n_s, n_a, V_0,
                                                   num_iter, r, p, gamma)
        future_V[i] = np.dot(rou, V_here)
        FS_bool = optimize_pfs.FS_bool(Q_estimate, V_max_index, n_s, n_a)
        CS_num += FS_bool
    # float() replaces np.float, which was removed in NumPy 1.24.
    PCS = float(CS_num) / num_rep
    CI_len = 1.96 * np.sqrt(PCS * (1 - PCS) / num_rep)
    fv = np.mean(future_V)
    fv_std = np.std(future_V)
    rv = np.dot(rou, V_real)
    diff = rv - fv
    print("PCS is {}, with CI length {}".format(PCS, CI_len))
    print(
        "future value func is {} with  CI length {}, real value is {}, diff is {}"
        .format(fv, 1.96 * fv_std / np.sqrt(num_rep), rv, diff))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--rep',
                        nargs="?",
                        type=int,
                        default=100,
                        help='number of repetitions')
    parser.add_argument('--episode',
                        nargs="?",
                        type=int,
                        default=100,
                        help='number of episode')
    #parser.add_argument('--r0', nargs = "?", type = float, default = 1.0, help = 'value of r0'  )
    parser.add_argument('--numdata',
                        nargs="?",
                        type=int,
                        default=1000,
                        help='number of data')
    parser.add_argument('--rightprop',
                        nargs="?",
                        type=float,
                        default=0.6,
                        help='warm start random exploration right probability')
    parser.add_argument('--rstd',
                        nargs="?",
                        type=float,
                        default=1.0,
                        help='standard deviation of reward')

    args = parser.parse_args()

    num_iter, gamma, n_s, n_a, num_rep = 200, 0.95, 5, 2, args.rep
    episodes = args.episode
    Total_data = args.numdata
    right_prop = args.rightprop
    #print(num_rep, episodes, Total_data, right_prop)

    r = np.zeros(n_s * n_a)
    r_vals = range(1, 4)
    #r_vals = [5./1000]
    r_right = 10.0

    for r0_val in r_vals:
        r[0] = float(r0_val)
        r[-1] = r_right
        r_std = args.rstd
        print("reward standard deviation is {}".format(r_std))
        # r[0] = 10.
        # r[-1] = 0.1
        Q_0 = np.zeros(n_s * n_a)
        V_0 = np.zeros(n_s)
        rou = np.ones(n_s) / n_s
        p = np.zeros(n_s * n_a * n_s)
        print("r[0] and r[-1] are {}, {}".format(r[0], r[-1]))
        #exit()

        p[0 * n_s * n_a + 0 * n_s + 0] = 1.
        p[0 * n_s * n_a + 1 * n_s + 0] = 0.7
        p[0 * n_s * n_a + 1 * n_s + 1] = 0.3
        for i in range(1, (n_s - 1)):
            p[i * n_a * n_s + 0 * n_s + (i - 1)] = 1
            p[i * n_a * n_s + 1 * n_s + (i - 1)] = 0.1
            p[i * n_a * n_s + 1 * n_s + i] = 0.6
            p[i * n_a * n_s + 1 * n_s + (i + 1)] = 0.3
        p[(n_s - 1) * n_s * n_a + 0 * n_s + (n_s - 2)] = 1
        p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 2)] = 0.7
        p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 1)] = 0.3
        Q_real = Iterative_Cal_Q.cal_Q_val(p, Q_0, r, num_iter, gamma, n_s,
                                           n_a)
        V_real, V_max_index = inference.get_V_from_Q(Q_real, n_s, n_a)
        print("Q real is {}".format(Q_real))
        s_0 = 2

        Q_approximation = None
        initial_s_dist = "even"
        if initial_s_dist == "even":
            R_real = np.mean(V_real)
            initial_w = np.ones(n_s) / n_s
        cov_bools_Q = np.zeros(n_s * n_a)
        cov_bools_V = np.zeros(n_s)
        cov_bools_R = 0.
        # print("Q real is {}".format(Q_real))
        # print("V real is {}".format(V_real))
        # print("R real is {}".format(R_real))
        CI_lens_Q = []
        CI_lens_V = []
        CI_lens_R = []
        numerical_tol = 1e-6
        S_0 = None

        ## PSRL data parameter specification
        print("total num of data is {}".format(Total_data))
        numdata_1 = Total_data * 3 / 10
        seq_if = False
        numdata_2 = Total_data - numdata_1

        print("# of epsisodes is {}".format(episodes))
        num_datas = [numdata_2 / episodes] * episodes
        #print(num_datas)
        CS_num = 0.
        future_V = np.zeros(num_rep)

        for i in range(num_rep):
            para_cl = parameter_prior(n_s, n_a, s_0)
            all_data = []
            if not seq_if:
                while True:
                    data1 = collect_data_swimmer.collect_data(
                        p,
                        r,
                        numdata_1,
                        s_0,
                        n_s,
                        n_a,
                        right_prop=right_prop,
                        std=r_std)
                    p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats(
                        data1, n_s, n_a)
                    Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter,
                                                    gamma, n_s, n_a)
                    V_n, V_n_max_index = inference.get_V_from_Q(Q_n, n_s, n_a)
                    #print("first stage visiting frequency is {}".format(f_n))
                    if f_n.all() != 0:
                        break
            else:
                data1 = collect_data_swimmer.collect_data(
                    p,
                    r,
                    numdata_2 / episodes,
                    s_0,
                    n_s,
                    n_a,
                    right_prop=right_prop,
                    std=r_std)
            #data =  collect_data_swimmer.collect_data(p, r, numdata_1, s_0, n_s, n_a, right_prop=right_prop)
            all_data += data1
            para_cl.update(data1, r_sigma=r_std)
            Q_estimate = para_cl.sampled_MDP_Q(Q_0, num_iter, gamma)
            #print(Q_estimate)
            second_stage_data = []
            for num_data in num_datas:
                data = collect_data_swimmer.collect_data(p,
                                                         r,
                                                         num_data,
                                                         para_cl.s_0,
                                                         n_s,
                                                         n_a,
                                                         Q=Q_estimate,
                                                         epsilon=0,
                                                         std=r_std)
                all_data += data
                second_stage_data += data
                para_cl.update(data, r_sigma=r_std)
                Q_estimate = para_cl.sampled_MDP_Q(Q_0, num_iter, gamma)
                #print(para_cl.pprior)
                #print(para_cl.r_mean)
            #exit()
            #print(Q_estimate)
            #print(para_cl.pprior)
            #print(para_cl.r_mean)
            #transition = np.array([1.] * n_s * (n_s * n_a))
            #for i in range(n_s):
            #    for j in range(n_a):
            #        transition[
            #        (i * n_s * n_a + j * n_s): (i * n_s * n_a + (j + 1) * n_s)] = para_cl.pprior[(i * n_s * n_a + j * n_s): (i * n_s * n_a + (j + 1) * n_s)] \
            #                                                                      / np.sum(para_cl.pprior[(i * n_s * n_a + j * n_s): (i * n_s * n_a + (j + 1) * n_s)])
            #r_n = para_cl.r_mean
            #print(r_n)
            #print(transition)
            #Q_estimate = Iterative_Cal_Q.cal_Q_val(transition, Q_0, r_n, num_iter , gamma, n_s, n_a)
            #print(len(all_data))
            p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats(
                all_data, n_s, n_a)
            Q_estimate = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter,
                                                   gamma, n_s, n_a)
            V_here = optimize_pfs.policy_val_iteration(Q_estimate, n_s, n_a,
                                                       V_0, num_iter, r, p,
                                                       gamma)
            # print(V_here, V_real)
            future_V[i] = np.dot(rou, V_here)
            FS_bool = optimize_pfs.FS_bool(Q_estimate, V_max_index, n_s, n_a)
            CS_num += FS_bool

            # 5.3
            Q_n, CI_len_Q, V_n, CI_len_V, R_n, CI_len_R = inference.get_CI(
                Q_approximation,
                S_0,
                Total_data,
                s_0,
                num_iter,
                gamma,
                Q_0,
                n_s,
                n_a,
                r,
                p,
                initial_w,
                right_prop,
                data=all_data)
            # print("{} th replication : Q_n is {}, CI len is {}".format(i, Q_n, CI_len))
            cov_bool_Q = np.logical_and(
                Q_real <= (Q_n + CI_len_Q + numerical_tol), Q_real >=
                (Q_n - CI_len_Q - numerical_tol))
            cov_bool_V = np.logical_and(
                V_real <= (V_n + CI_len_V + numerical_tol), V_real >=
                (V_n - CI_len_V - numerical_tol))
            cov_bool_R = np.logical_and(
                R_real <= (R_n + CI_len_R + numerical_tol), R_real >=
                (R_n - CI_len_R - numerical_tol))
            # print(cov_bool_Q)
            # exit()
            # print(cov_bool)
            cov_bools_Q += cov_bool_Q
            cov_bools_V += cov_bool_V
            cov_bools_R += cov_bool_R
            CI_lens_Q.append(CI_len_Q)
            CI_lens_V.append(CI_len_V)
            CI_lens_R.append(CI_len_R)

        PCS = np.float(CS_num) / num_rep
        CI_len = 1.96 * np.sqrt(PCS * (1 - PCS) / num_rep)
        fv = np.mean(future_V)
        fv_std = np.std(future_V)
        rv = np.dot(rou, V_real)
        diff = rv - fv
        # print(CS_num_naive)
        print("PCS is {}, with CI length {}".format(PCS, CI_len))
        print(
            "future value func is {} with  CI length {}, real value is {}, diff is {}"
            .format(fv, 1.96 * fv_std / np.sqrt(num_rep), rv, diff))

        CI_len_Q_mean = np.mean(CI_lens_Q)
        CI_len_V_mean = np.mean(CI_lens_V)
        CI_len_R_mean = np.mean(CI_lens_R)
        CI_len_Q_ci = 1.96 * np.std(CI_lens_Q) / np.sqrt(num_rep)
        CI_len_V_ci = 1.96 * np.std(CI_lens_V) / np.sqrt(num_rep)
        CI_len_R_ci = 1.96 * np.std(CI_lens_R) / np.sqrt(num_rep)

        cov_rate_Q = np.divide(cov_bools_Q, num_rep)
        cov_rate_V = np.divide(cov_bools_V, num_rep)
        cov_rate_R = np.divide(cov_bools_R, num_rep)
        cov_rate_CI_Q = 1.96 * np.sqrt(cov_rate_Q * (1 - cov_rate_Q) / num_rep)
        cov_rate_CI_V = 1.96 * np.sqrt(cov_rate_V * (1 - cov_rate_V) / num_rep)
        cov_rate_CI_R = 1.96 * np.sqrt(cov_rate_R * (1 - cov_rate_R) / num_rep)
        print("coverage for Q")
        print(cov_rate_Q)
        print(cov_rate_CI_Q)
        print("mean coverage for Q ")
        print(np.mean(cov_rate_Q))
        print(np.mean(cov_rate_CI_Q))
        print("coverage for V")
        print(cov_rate_V)
        print(cov_rate_CI_V)
        print("mean coverage for V")
        print(np.mean(cov_rate_V))
        print(np.mean(cov_rate_CI_V))
        print("coverage for R")
        print(cov_rate_R)
        print(cov_rate_CI_R)
        print("CI len for Q CI {} with ci {}".format(CI_len_Q_mean,
                                                     CI_len_Q_ci))
        print("CI len for V CI {} with ci {}".format(CI_len_V_mean,
                                                     CI_len_V_ci))
        print("CI len for R CI {} with ci {}".format(CI_len_R_mean,
                                                     CI_len_R_ci))