Example #1
    # reconstructed branch (the snippet begins mid-statement): when a saved
    # model exists, load its parameters instead of overwriting it
    if load_policy:
        policy.set_param_values(np.loadtxt("save_model/" + file_name),
                                trainable=True)
    else:
        np.savetxt("save_model/" + file_name,
                   policy.get_param_values(trainable=True))
        load_policy = True

    # initial setup
    avg_return = list()
    eps_list = []
    max_rewards = -np.inf
    num_traj = 0

    # loop till done
    while num_traj <= max_num_traj:
        # sample snapshot batch of trajectories
        paths = parallel_sampler.sample_paths_on_trajectories(
            policy.get_param_values(), snap_bs, traj_length, show_bar=False)
        paths = paths[:snap_bs]  # keep exactly snap_bs trajectories (the parallel sampler may return more)

        # extract information
        observations, actions, d_rewards = extract_path(paths, discount)

        # compute policy gradient
        v_est = compute_snapshot_grad_est(f_compute_grad, observations,
                                          actions, d_rewards)

        # perform update
        f_update(v_est[0], v_est[1], v_est[2], v_est[3])

        # sample trajectories for evaluating current policy
        # traj_length is assumed here, mirroring the snapshot sampling call above
        tmp_paths = parallel_sampler.sample_paths_on_trajectories(
            policy.get_param_values(), num_eval_traj, traj_length,
            show_bar=False)
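        # A typical next step (sketch only; the example is truncated here) would
        # be to log the average undiscounted return of the evaluation paths:
        #     avg_return.append(np.mean([np.sum(p["rewards"]) for p in tmp_paths]))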
Example #2
# f_train evaluates the policy-gradient estimate for a single trajectory; the
# assignment line is reconstructed, since the snippet begins mid-statement
f_train = theano.function(
    inputs=[observations_var, actions_var, d_rewards_var], outputs=grad)
f_update = theano.function(
    inputs=[eval_grad1, eval_grad2, eval_grad3, eval_grad4, eval_grad5],
    outputs=None,
    updates=sgd([eval_grad1, eval_grad2, eval_grad3, eval_grad4, eval_grad5],
                params,
                learning_rate=learning_rate))
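# Note: `sgd` is presumably lasagne.updates.sgd, whose update dict maps each
# parameter to `param - learning_rate * grad`, so f_update applies one plain
# gradient step using the externally supplied gradient estimates
# (eval_grad1..5, one per trainable parameter tensor of the policy).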

alla = []
for i in range(10):
    if load_policy:
        policy.set_param_values(np.loadtxt('policy.txt'), trainable=True)
    avg_return = np.zeros(n_itr)
    #np.savetxt("policy_novar.txt",snap_policy.get_param_values(trainable=True))
    for j in range(n_itr):
        paths = parallel_sampler.sample_paths_on_trajectories(
            policy.get_param_values(), N, T, show_bar=False)
        #baseline.fit(paths)
        observations = [p["observations"] for p in paths]
        actions = [p["actions"] for p in paths]
        d_rewards = [p["rewards"] for p in paths]
        temp = list()
        for x in d_rewards:
            z = list()
            t = 1
            for y in x:
                z.append(y * t)
                t *= discount
            temp.append(np.array(z))
        d_rewards = temp
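        # The loop above scales the reward at step t by discount**t.  An
        # equivalent vectorized form (sketch, not in the original) would be:
        #     d_rewards = [np.asarray(x) * discount ** np.arange(len(x)) for x in d_rewards]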
        # gradient from the first trajectory, then accumulate over the rest
        s_g = f_train(observations[0], actions[0], d_rewards[0])
        for ob, ac, rw in zip(observations[1:], actions[1:], d_rewards[1:]):
            # assumed continuation (the example is cut off at this point):
            # sum the per-trajectory gradients so they can be averaged later
            s_g = [sum(x) for x in zip(s_g, f_train(ob, ac, rw))]
Example #3
    if load_policy:  # assumed guard; the snippet begins inside this block
        snap_policy.set_param_values(np.loadtxt('policy_swimmer.txt'),
                                     trainable=True)
        policy.set_param_values(np.loadtxt('policy_swimmer.txt'),
                                trainable=True)
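    # bookkeeping for the run; the names suggest an SVRG-style policy-gradient
    # experiment: returns at snapshot points and at sub-iterations, per-batch
    # importance weights, and estimated variances of the SVRG and plain SGD
    # gradient estimators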
    avg_return = list()
    n_sub_iter = []
    rewards_sub_iter = []
    rewards_snapshot = []
    importance_weights = []
    variance_svrg = []
    variance_sgd = []

    #np.savetxt("policy_novar.txt",snap_policy.get_param_values(trainable=True))
    j = 0
    while j < s_tot - N:
        paths = parallel_sampler.sample_paths_on_trajectories(
            snap_policy.get_param_values(), N, T, show_bar=False)
        #baseline.fit(paths)
        paths = paths[:N]
        j += N
        observations = [p["observations"] for p in paths]
        actions = [p["actions"] for p in paths]
        d_rewards = [p["rewards"] for p in paths]
        temp = list()
        for x in d_rewards:
            z = list()
            t = 1
            for y in x:
                z.append(y * t)
                t *= discount
            temp.append(np.array(z))
        d_rewards = temp