Example #1
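All of the examples below seem to assume the same imports; a minimal sketch is given here. `time`, `gym` (an older release where `step()` returns four values and "FrozenLake-v0" exists), and `numpy` are standard, while the GreedyGQ classes, policies and evaluation helpers presumably come from the surrounding project, so the commented-out import and its module name are only a guess.

import time

import gym          # the examples use an older gym API ("FrozenLake-v0", 4-tuple step())
import numpy as np

# Project-specific components; the module name below is a placeholder.
# from greedy_gq import (GreedyGQ_Base, VRGreedyGQ, PolicyGradient, ActorCritic,
#                        SoftmaxPolicy, evaluate_J, evaluate_J_obj,
#                        compute_stationary_dist, get_total_reward)
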
def _simulation(tmp_env,
                ini_theta,
                alpha,
                beta,
                batch_size,
                trajectory_length=50000,
                gamma=0.95,
                target=None):
    env = gym.make("FrozenLake-v0")
    env.reset()
    current_state = 0

    gq = GreedyGQ_Base(tmp_env,
                       target_policy=target,
                       eta_theta=alpha,
                       eta_omega=beta,
                       gamma=gamma)
    gq.set_theta(ini_theta)
    for i in range(trajectory_length):
        random_action = env.action_space.sample()
        new_state, reward, done, info = env.step(random_action)
        next_state = new_state
        action = random_action

        gq.update(current_state, reward, next_state, action)

        if done:
            env.reset()
            current_state = 0
        else:
            # advance the behaviour state; otherwise every update is made from state 0
            current_state = next_state
    return gq.theta[0]
Example #2
def _simulation(env, ini_theta, alpha, beta, batch_size, trajectory_length=50000, gamma=0.95, target=None):
    train_start = time.time()
    env.reset()
    current_state = env.current_state

    gq = GreedyGQ_Base(env, target_policy=target, eta_theta=alpha, eta_omega=beta, gamma=gamma)
    gq.set_theta(ini_theta)

    vrgq = VRGreedyGQ(env, batch_size=batch_size, target_policy=target, eta_theta=alpha, eta_omega=beta, gamma=gamma)
    vrgq.set_theta(ini_theta)

    gq_var = []
    vrgq_var = []
    count = 1
    num_sample_mc = 500
    for i in range(trajectory_length):
        next_state, reward, action = env.step()

        gq.update(current_state, reward, next_state, action)
        vrgq.update(current_state, reward, next_state, action)

        if i >= batch_size:
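            # Monte-Carlo estimate of the variance of the VR-GreedyGQ gradient,
            # using num_sample_mc samples drawn from the environment.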
            estimated_var = 0.0
            tmp_theta = np.copy(vrgq.theta)
            tmp_w = np.copy(vrgq.omega)
            true_grad = evaluate_J(env, tmp_theta)
            for ddd in range(num_sample_mc):
                ss, aa, next_ss, rr = env.sample()
                grad_theta, grad_omega = vrgq.get_grad(ss, rr, next_ss, aa)
                estimated_var += np.sum((grad_theta - true_grad) ** 2) / num_sample_mc
            vrgq_var.append(estimated_var)

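        # The same Monte-Carlo variance estimate for the plain GreedyGQ gradient, taken at every iteration.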
        estimated_var = 0.0
        tmp_theta = np.copy(gq.theta[0])
        tmp_w = np.copy(gq.omega)
        true_grad = evaluate_J(env, tmp_theta)
        for ddd in range(num_sample_mc):
            ss, aa, next_ss, rr = env.sample()
            grad_theta, grad_omega = gq._extract_grad_info(ss, rr, next_ss, aa)
            estimated_var += np.sum((grad_theta - true_grad) ** 2) / num_sample_mc
        gq_var.append(estimated_var)

        if (i + 1) % 10000 == 0:
            print("Current iteration:", i + 1, ". Time Spent:", time.time() - train_start)
            train_start = time.time()
        count += 1

        current_state = np.copy(next_state)
    return gq_var, vrgq_var
Example #3
def _easy_simulation(env,
                     alpha,
                     beta,
                     batch_size,
                     trajectory_length=50000,
                     num_simulation=100,
                     gamma=0.95,
                     target=None):
    ini_start = time.time()

    print("Initialization...")
    ini_theta = np.random.normal(scale=3.0, size=env.num_features)
    print("Initialization Completed. Time Spent:", time.time() - ini_start)
    stationary = compute_stationary_dist(env.trans_kernel)

    all_gq_hist_min = []
    all_vrgq_hist_min = []

    all_gq_hist_last = []
    all_vrgq_hist_last = []

    all_gq_hist_avg = []
    all_vrgq_hist_avg = []

    for _ in range(num_simulation):
        env.reset()
        current_state = env.current_state

        estimate1 = GreedyGQ_Base(env,
                                  target_policy=target,
                                  eta_theta=alpha,
                                  eta_omega=beta,
                                  gamma=gamma)
        estimate1.set_theta(ini_theta)

        estimate2 = GreedyGQ_Base(env,
                                  target_policy=target,
                                  eta_theta=alpha,
                                  eta_omega=beta,
                                  gamma=gamma)
        estimate2.set_theta(ini_theta)

        gq = GreedyGQ_Base(env,
                           target_policy=target,
                           eta_theta=alpha,
                           eta_omega=beta,
                           gamma=gamma)
        gq.set_theta(ini_theta)

        vrgq = VRGreedyGQ(env,
                          batch_size=batch_size,
                          target_policy=target,
                          eta_theta=alpha,
                          eta_omega=beta,
                          gamma=gamma)
        vrgq.set_theta(ini_theta)

        print("Start Training. Simulation:", _ + 1)
        train_start = time.time()

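        # Track the objective J along the trajectory: the most recent value and the running minimum for both learners.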
        gq_hist_last = [evaluate_J(env, gq.theta)]
        vrgq_hist_last = [evaluate_J(env, gq.theta)]
        gq_hist_min = [evaluate_J(env, gq.theta)]
        vrgq_hist_min = [evaluate_J(env, gq.theta)]
        count = 1
        for i in range(trajectory_length):
            next_state, reward, action = env.step()

            grad_theta_gq, grad_omega_gq = gq.update(current_state, reward,
                                                     next_state, action)
            gq_hist_min.append(
                np.min(gq_hist_min + [evaluate_J(env, gq.theta[0])]))
            gq_hist_last.append(evaluate_J(env, gq.theta[0]))

            grad_theta_vrgq, grad_omega_vrgq = vrgq.update(
                current_state, reward, next_state, action)
            vrgq_hist_min.append(
                np.min(vrgq_hist_min + [evaluate_J(env, vrgq.theta)]))
            vrgq_hist_last.append(evaluate_J(env, vrgq.theta))

            # Estimate the variance (placeholder: the helper estimators are
            # synchronised with the current parameters, but the per-state
            # accumulation loop body is not implemented).
            grad_theta = np.zeros_like(grad_theta_gq)
            grad_omega = np.zeros_like(grad_omega_gq)
            estimate1.set_theta(gq.theta)
            estimate1.set_omega(gq.omega)
            estimate2.set_theta(vrgq.theta)
            estimate2.set_omega(vrgq.omega)
            for sss in env.state_space:
                pass

            current_state = np.copy(next_state)
            if (i + 1) % 10000 == 0:
                print("Current iteration:", i + 1, ". Time Spent:",
                      time.time() - train_start)
                train_start = time.time()
            count += 1
        all_gq_hist_min.append(gq_hist_min)
        all_vrgq_hist_min.append(vrgq_hist_min)
        all_gq_hist_last.append(gq_hist_last)
        all_vrgq_hist_last.append(vrgq_hist_last)
    return all_gq_hist_last, all_vrgq_hist_last
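Note that calls such as np.min(gq_hist_min + [evaluate_J(env, gq.theta[0])]) above, and the analogous np.max(r_gq + [...]) calls in the later examples, rebuild and scan the whole history at every step only to extend a running minimum or maximum. Because each stored entry is already the best value seen so far, a constant-time helper (not part of the original code; the names are made up) would do the same job:

def running_min_append(history, value):
    # history[-1] already holds the smallest value seen so far
    history.append(min(history[-1], value))


def running_max_append(history, value):
    # history[-1] already holds the largest value seen so far
    history.append(max(history[-1], value))

For instance, gq_hist_min.append(np.min(gq_hist_min + [evaluate_J(env, gq.theta[0])])) becomes running_min_append(gq_hist_min, evaluate_J(env, gq.theta[0])).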
Example #4
def _simulation(env,
                ini_theta,
                alpha,
                beta,
                batch_size,
                trajectory_length=50000,
                gamma=0.95,
                target=None):
    train_start = time.time()
    env.reset()
    current_state = env.current_state

    pg = PolicyGradient(env, eta_theta=alpha, gamma=gamma, is_on_policy=False)
    pg.set_theta(ini_theta)

    gq = GreedyGQ_Base(env,
                       target_policy=target,
                       eta_theta=alpha,
                       eta_omega=beta,
                       gamma=gamma)
    gq.set_theta(ini_theta)

    vrgq = VRGreedyGQ(env,
                      batch_size=batch_size,
                      target_policy=target,
                      eta_theta=alpha,
                      eta_omega=beta,
                      gamma=gamma)
    vrgq.set_theta(ini_theta)

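    # Baseline: average per-step reward of the initial softmax policy over a 1000-step rollout on a deterministic FrozenLake copy.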
    env2 = gym.make("FrozenLake-v0", is_slippery=False)
    env2.reset()
    current_state2 = 0
    pppp = SoftmaxPolicy(gq.theta, env, 1.0)
    N = 1000
    r = 0
    for iii in range(N):
        random_action = pppp.get_action(current_state2)
        new_state, reward, done, info = env2.step(random_action)
        if done:
            env2.reset()
            current_state2 = 0
        else:
            current_state2 = new_state
        r += reward
    r /= 1000.0

    r_gq = [np.copy(r)]
    r_vrgq = [np.copy(r)]
    r_pg = [np.copy(r)]

    for i in range(trajectory_length):
        next_state, reward, action = env.step()

        pg.update(current_state, action)
        if i % 1000 == 0:
            env2 = gym.make("FrozenLake-v0", is_slippery=False)
            env2.reset()
            current_state2 = 0
            pppp = SoftmaxPolicy(pg.theta, env, 1.0)
            N = 1000
            r = 0
            for iii in range(N):
                random_action = pppp.get_action(current_state2)
                # use a separate name so the transition reward from env.step() above is not overwritten
                new_state, step_reward, done, info = env2.step(random_action)
                if done:
                    env2.reset()
                    current_state2 = 0
                else:
                    current_state2 = new_state
                r += step_reward
            r /= 1000.0
            r_pg.append(np.max(r_pg + [np.copy(r)]))

        gq.update(current_state, reward, next_state, action)
        if i % 1000 == 0:
            env2 = gym.make("FrozenLake-v0", is_slippery=False)
            env2.reset()
            current_state2 = 0
            pppp = SoftmaxPolicy(gq.theta, env, 1.0)
            N = 1000
            r = 0
            for iii in range(N):
                random_action = pppp.get_action(current_state2)
                new_state, step_reward, done, info = env2.step(random_action)
                if done:
                    env2.reset()
                    current_state2 = 0
                else:
                    current_state2 = new_state
                r += step_reward
            r /= 1000.0
            r_gq.append(np.max(r_gq + [np.copy(r)]))

        vrgq.update(current_state, reward, next_state, action)
        if i > batch_size:
            if i % 1000 == 0:
                env2 = gym.make("FrozenLake-v0", is_slippery=False)
                env2.reset()
                current_state2 = 0
                pppp = SoftmaxPolicy(vrgq.theta, env, 1.0)
                N = 1000
                r = 0
                for iii in range(N):
                    random_action = pppp.get_action(current_state2)
                    new_state, step_reward, done, info = env2.step(random_action)
                    if done:
                        env2.reset()
                        current_state2 = 0
                    else:
                        current_state2 = new_state
                    r += step_reward
                r /= 1000.0
                r_vrgq.append(np.max(r_vrgq + [np.copy(r)]))

        if (i + 1) % 10000 == 0:
            print("Current iteration:", i + 1, ". Time Spent:",
                  time.time() - train_start)
            train_start = time.time()

        current_state = np.copy(next_state)
    print("Done")
    return r_pg, r_gq, r_vrgq
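The block that scores a parameter vector by rolling a softmax policy on a deterministic FrozenLake copy for 1000 steps is repeated verbatim for the policy-gradient, GreedyGQ and VR-GreedyGQ learners above, and again in the last examples. A sketch of a helper that could replace those copies is shown below; the function name and the n_steps argument are made up, everything else follows the examples (older gym API, SoftmaxPolicy with temperature 1.0):

def _average_rollout_reward(theta, feature_env, n_steps=1000):
    # Deterministic FrozenLake copy used only for evaluation (older gym 4-tuple API).
    eval_env = gym.make("FrozenLake-v0", is_slippery=False)
    eval_env.reset()
    state = 0
    policy = SoftmaxPolicy(theta, feature_env, 1.0)
    total = 0.0
    for _ in range(n_steps):
        action = policy.get_action(state)
        new_state, step_reward, done, info = eval_env.step(action)
        if done:
            eval_env.reset()
            state = 0
        else:
            state = new_state
        total += step_reward
    return total / n_steps

With it, each if i % 1000 == 0: block reduces to something like r_gq.append(max(r_gq[-1], _average_rollout_reward(gq.theta, env))).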
Example #5
def _simulation(env,
                ini_theta,
                alpha,
                beta,
                batch_size,
                trajectory_length=50000,
                gamma=0.95,
                target=None):
    train_start = time.time()
    env.reset()
    current_state = env.current_state

    pg = ActorCritic(env,
                     eta_theta=alpha,
                     eta_omega=beta,
                     gamma=gamma,
                     is_on_policy=False)
    pg.set_theta(ini_theta)

    gq = GreedyGQ_Base(env,
                       target_policy=target,
                       eta_theta=alpha,
                       eta_omega=beta,
                       gamma=gamma)
    gq.set_theta(ini_theta)

    vrgq = VRGreedyGQ(env,
                      batch_size=batch_size,
                      target_policy=target,
                      eta_theta=alpha,
                      eta_omega=beta,
                      gamma=gamma)
    vrgq.set_theta(ini_theta)

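    # Baseline: reward of the initial softmax policy evaluated under its stationary state distribution.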
    tmpp_env = env.get_copy()
    pppp = SoftmaxPolicy(pg.theta, tmpp_env, 1.0)
    ppp = np.zeros((tmpp_env.num_state, tmpp_env.num_action))
    for ssss in tmpp_env.state_space:
        for aaaa in tmpp_env.action_space:
            ppp[ssss, aaaa] = pppp.policy(aaaa, ssss)
    tmpp_env.set_behavior_policy(ppp)
    stat = compute_stationary_dist(tmpp_env.trans_kernel)
    r = get_total_reward(reward=tmpp_env.reward, stationary_dist=stat)
    r_pg = [np.copy(r)]
    r_gq = [np.copy(r)]
    r_vrgq = [np.copy(r)]

    for i in range(trajectory_length):
        next_state, reward, action = env.step()

        pg.update(current_state, action)

        if i % 10 == 0:
            tmpp_env = env.get_copy()
            pppp = SoftmaxPolicy(pg.theta, tmpp_env, 1.0)
            ppp = np.zeros((tmpp_env.num_state, tmpp_env.num_action))
            for ssss in tmpp_env.state_space:
                for aaaa in tmpp_env.action_space:
                    _tmp = pppp.policy(aaaa, ssss)
                    ppp[ssss, aaaa] = _tmp[0]
            tmpp_env.set_behavior_policy(ppp)
            stat = compute_stationary_dist(tmpp_env.trans_kernel)
            r = get_total_reward(reward=tmpp_env.reward, stationary_dist=stat)
            r_pg.append(np.max(r_pg + [np.copy(r)]))

        gq.update(current_state, reward, next_state, action)
        if i % 10 == 0:
            tmpp_env = env.get_copy()
            pppp = SoftmaxPolicy(gq.theta, tmpp_env, 1.0)
            ppp = np.zeros((tmpp_env.num_state, tmpp_env.num_action))
            for ssss in tmpp_env.state_space:
                for aaaa in tmpp_env.action_space:
                    ppp[ssss, aaaa] = pppp.policy(aaaa, ssss)
            tmpp_env.set_behavior_policy(ppp)
            stat = compute_stationary_dist(tmpp_env.trans_kernel)
            r = get_total_reward(reward=tmpp_env.reward, stationary_dist=stat)
            r_gq.append(np.max(r_gq + [np.copy(r)]))

        vrgq.update(current_state, reward, next_state, action)
        if i % 10 == 0:
            tmpp_env = env.get_copy()
            pppp = SoftmaxPolicy(vrgq.theta, tmpp_env, 1.0)
            ppp = np.zeros((tmpp_env.num_state, tmpp_env.num_action))
            for ssss in tmpp_env.state_space:
                for aaaa in tmpp_env.action_space:
                    ppp[ssss, aaaa] = pppp.policy(aaaa, ssss)
            tmpp_env.set_behavior_policy(ppp)
            stat = compute_stationary_dist(tmpp_env.trans_kernel)
            r = get_total_reward(reward=tmpp_env.reward, stationary_dist=stat)
            if i >= vrgq.batch_size:
                r_vrgq.append(np.max(r_vrgq + [np.copy(r)]))

        if (i + 1) % 10000 == 0:
            print("Current iteration:", i + 1, ". Time Spent:",
                  time.time() - train_start)
            train_start = time.time()

        current_state = np.copy(next_state)
    return r_pg, r_gq, r_vrgq
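Example #5 and the final example repeat the same evaluation block for every learner: build the softmax policy induced by the current theta, install it as the behaviour policy of a copy of the environment, and compute the reward under the resulting stationary distribution. A possible refactoring, using only calls that already appear in the examples (the helper name is made up):

def _stationary_reward(theta, env):
    eval_env = env.get_copy()
    policy = SoftmaxPolicy(theta, eval_env, 1.0)
    probs = np.zeros((eval_env.num_state, eval_env.num_action))
    for s in eval_env.state_space:
        for a in eval_env.action_space:
            probs[s, a] = policy.policy(a, s)
    eval_env.set_behavior_policy(probs)
    stationary = compute_stationary_dist(eval_env.trans_kernel)
    return get_total_reward(reward=eval_env.reward, stationary_dist=stationary)

One branch above indexes the result of policy() with [0]; if policy() returns an array for that learner, the helper would need the same indexing there.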
Example #6
def _simulation(tmp_env,
                ini_theta,
                alpha,
                beta,
                batch_size,
                trajectory_length=50000,
                gamma=0.95,
                target=None):
    env = gym.make("FrozenLake-v0", is_slippery=False)
    env.reset()
    current_state = 0

    gq = GreedyGQ_Base(tmp_env,
                       target_policy=target,
                       eta_theta=alpha,
                       eta_omega=beta,
                       gamma=gamma)
    gq.set_theta(ini_theta)

    vrgq = VRGreedyGQ(tmp_env,
                      batch_size=batch_size,
                      target_policy=target,
                      eta_theta=alpha,
                      eta_omega=beta,
                      gamma=gamma)
    vrgq.set_theta(ini_theta)

    gq_hist_avg = [evaluate_J_obj(tmp_env, gq.theta)]
    vrgq_hist_avg = [evaluate_J_obj(tmp_env, gq.theta)]

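    # Baseline: score the initial policy with a 1000-step rollout on a deterministic FrozenLake copy.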
    env2 = gym.make("FrozenLake-v0", is_slippery=False)
    env2.reset()
    current_state2 = 0
    pppp = SoftmaxPolicy(gq.theta, tmp_env, 1.0)
    N = 1000
    r = 0
    for iii in range(N):
        random_action = pppp.get_action(current_state2)
        new_state, reward, done, info = env2.step(random_action)
        if done:
            env2.reset()
            current_state2 = 0
        else:
            current_state2 = new_state
        r += reward
    r /= 1000.0

    r_gq = [np.copy(r)]
    r_vrgq = [np.copy(r)]
    for i in range(trajectory_length):
        random_action = env.action_space.sample()
        new_state, reward, done, info = env.step(random_action)
        next_state = new_state
        action = random_action

        gq.update(current_state, reward, next_state, action)
        if i % 1000 == 0:
            # gq_hist_avg.append((len(gq_hist_avg)  * gq_hist_avg[-1] + evaluate_J(tmp_env, gq.theta[0])) /(len(gq_hist_avg) + 1))
            gq_hist_avg.append(
                np.min(gq_hist_avg + [evaluate_J_obj(tmp_env, gq.theta[0])]))

            env2 = gym.make("FrozenLake-v0", is_slippery=False)
            env2.reset()
            current_state2 = 0
            pppp = SoftmaxPolicy(gq.theta, tmp_env, 1.0)
            N = 1000
            r = 0
            for iii in range(N):
                random_action = pppp.get_action(current_state2)
                # separate names keep the outer loop's reward/done from env.step() intact
                new_state, step_reward, ep_done, info = env2.step(random_action)
                if ep_done:
                    env2.reset()
                    current_state2 = 0
                else:
                    current_state2 = new_state
                r += step_reward
            r /= 1000.0
            r_gq.append(np.max(r_gq + [np.copy(r)]))

        vrgq.update(current_state, reward, next_state, action)
        if i > batch_size:
            if i % 1000 == 0:
                # vrgq_hist_avg.append((len(vrgq_hist_avg)  * vrgq_hist_avg[-1] + evaluate_J(tmp_env, vrgq.theta)) /(len(vrgq_hist_avg) + 1))
                vrgq_hist_avg.append(
                    np.min(vrgq_hist_avg +
                           [evaluate_J_obj(tmp_env, vrgq.theta)]))

                env2 = gym.make("FrozenLake-v0", is_slippery=False)
                env2.reset()
                current_state2 = 0
                pppp = SoftmaxPolicy(vrgq.theta, tmp_env, 1.0)
                N = 1000
                r = 0
                for iii in range(N):
                    random_action = pppp.get_action(current_state2)
                    new_state, step_reward, ep_done, info = env2.step(random_action)
                    if ep_done:
                        env2.reset()
                        current_state2 = 0
                    else:
                        current_state2 = new_state
                    r += step_reward
                r /= 1000.0
                r_vrgq.append(np.max(r_vrgq + [np.copy(r)]))

        if done:
            env.reset()
            current_state = 0
        else:
            # advance the behaviour state; otherwise every update is made from state 0
            current_state = next_state
    return gq_hist_avg, vrgq_hist_avg, r_gq, r_vrgq
Example #7
def _simulation(env,
                ini_theta,
                alpha,
                beta,
                batch_size,
                trajectory_length=50000,
                gamma=0.95,
                target=None):
    train_start = time.time()
    env.reset()
    current_state = env.current_state

    gq = GreedyGQ_Base(env,
                       target_policy=target,
                       eta_theta=alpha,
                       eta_omega=beta,
                       gamma=gamma)
    gq.set_theta(ini_theta)

    vrgq = VRGreedyGQ(env,
                      batch_size=batch_size,
                      target_policy=target,
                      eta_theta=alpha,
                      eta_omega=beta,
                      gamma=gamma)
    vrgq.set_theta(ini_theta)

    gq_hist_avg = [evaluate_J(env, gq.theta)]
    vrgq_hist_avg = [evaluate_J(env, gq.theta)]

    tmpp_env = env.get_copy()
    pppp = SoftmaxPolicy(gq.theta, tmpp_env, 1.0)
    ppp = np.zeros((tmpp_env.num_state, tmpp_env.num_action))
    for ssss in tmpp_env.state_space:
        for aaaa in tmpp_env.action_space:
            ppp[ssss, aaaa] = pppp.policy(aaaa, ssss)
    tmpp_env.set_behavior_policy(ppp)
    stat = compute_stationary_dist(tmpp_env.trans_kernel)
    r = get_total_reward(reward=tmpp_env.reward, stationary_dist=stat)
    r_gq = [np.copy(r)]
    r_vrgq = [np.copy(r)]

    aaaaaaa = evaluate_J_obj(env, gq.theta)
    obj_gq = [np.copy(aaaaaaa)]
    obj_vrgq = [np.copy(aaaaaaa)]
    count = 1

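    # Track the best objective value seen so far and the parameters that achieved it (kept per learner, not returned).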
    best_gq_obj = np.copy(aaaaaaa)
    gq_best_theta = np.copy(gq.theta)
    best_vrgq_obj = np.copy(aaaaaaa)
    vrgq_best_theta = np.copy(gq.theta)
    for i in range(trajectory_length):
        next_state, reward, action = env.step()

        gq.update(current_state, reward, next_state, action)
        if i % 10 == 0:
            tmpp_env = env.get_copy()
            pppp = SoftmaxPolicy(gq.theta, tmpp_env, 1.0)
            ppp = np.zeros((tmpp_env.num_state, tmpp_env.num_action))
            for ssss in tmpp_env.state_space:
                for aaaa in tmpp_env.action_space:
                    ppp[ssss, aaaa] = pppp.policy(aaaa, ssss)
            tmpp_env.set_behavior_policy(ppp)
            stat = compute_stationary_dist(tmpp_env.trans_kernel)
            r = get_total_reward(reward=tmpp_env.reward, stationary_dist=stat)
            r_gq.append(np.max(r_gq + [np.copy(r)]))

            gq_hist_avg.append(
                np.min(gq_hist_avg + [evaluate_J(env, gq.theta[0])]))

            obj_tmp = evaluate_J_obj(env, gq.theta.transpose())[0]
            if obj_tmp < best_gq_obj:
                best_gq_obj = obj_tmp
                gq_best_theta = np.copy(gq.theta[0])
            if i > 100:
                obj_gq.append(np.min(obj_gq + [np.copy(obj_tmp)]))

            #gq_hist_avg.append( evaluate_J(env, gq.theta[0]) )
            #gq_hist_avg.append(
            #    (len(gq_hist_avg) * gq_hist_avg[-1] + evaluate_J(env, gq.theta[0])) / (len(gq_hist_avg) + 1))

        vrgq.update(current_state, reward, next_state, action)
        if i > batch_size:
            if i % 10 == 0:
                tmpp_env = env.get_copy()
                pppp = SoftmaxPolicy(vrgq.theta, tmpp_env, 1.0)
                ppp = np.zeros((tmpp_env.num_state, tmpp_env.num_action))
                for ssss in tmpp_env.state_space:
                    for aaaa in tmpp_env.action_space:
                        ppp[ssss, aaaa] = pppp.policy(aaaa, ssss)
                tmpp_env.set_behavior_policy(ppp)
                stat = compute_stationary_dist(tmpp_env.trans_kernel)
                r = get_total_reward(reward=tmpp_env.reward,
                                     stationary_dist=stat)
                r_vrgq.append(np.max(r_vrgq + [np.copy(r)]))
                #vrgq_hist_avg.append((len(vrgq_hist_avg)  * vrgq_hist_avg[-1] + evaluate_J(env, vrgq.theta)) /(len(vrgq_hist_avg) + 1))
                vrgq_hist_avg.append(
                    np.min(vrgq_hist_avg + [evaluate_J(env, vrgq.theta)]))
                #vrgq_hist_avg.append( evaluate_J(env, vrgq.theta))

                obj_tmp = evaluate_J_obj(env, vrgq.theta)
                if obj_tmp < best_vrgq_obj:
                    best_vrgq_obj = obj_tmp
                    vrgq_best_theta = np.copy(vrgq.theta)

                if i > 100:
                    obj_vrgq.append(np.min(obj_vrgq + [np.copy(obj_tmp)]))

        if (i + 1) % 10000 == 0:
            print("Current iteration:", i + 1, ". Time Spent:",
                  time.time() - train_start)
            train_start = time.time()
        count += 1

        current_state = np.copy(next_state)
    return gq_hist_avg, vrgq_hist_avg, r_gq, r_vrgq, obj_gq, obj_vrgq