        trainable=True)
else:
    # first run: save the initial parameters so later runs start from the same point
    np.savetxt("save_model/" + file_name,
               policy.get_param_values(trainable=True))
    load_policy = True

# initial setup
avg_return = list()
eps_list = []
max_rewards = -np.inf
num_traj = 0

# loop until the trajectory budget is exhausted
while num_traj <= max_num_traj:
    # sample snapshot batch of trajectories
    paths = parallel_sampler.sample_paths_on_trajectories(
        policy.get_param_values(), snap_bs, traj_length, show_bar=False)
    paths = paths[:snap_bs]

    # extract observations, actions and discounted rewards
    observations, actions, d_rewards = extract_path(paths, discount)

    # compute the snapshot policy-gradient estimate
    v_est = compute_snapshot_grad_est(f_compute_grad, observations, actions,
                                      d_rewards)

    # perform the parameter update
    f_update(v_est[0], v_est[1], v_est[2], v_est[3])

    # sample trajectories for evaluating the current policy
    tmp_paths = parallel_sampler.sample_paths_on_trajectories(
        policy.get_param_values(), num_eval_traj, show_bar=False)
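# `extract_path` and `compute_snapshot_grad_est` are defined elsewhere in the
# repo. A minimal sketch of what `extract_path` is assumed to do, mirroring the
# per-path extraction and discounting loops used in the other scripts below
# (the name `extract_path_sketch` is hypothetical):
def extract_path_sketch(paths, discount):
    observations = [p["observations"] for p in paths]
    actions = [p["actions"] for p in paths]
    # discounted rewards: r_t * discount**t along each trajectory
    d_rewards = [np.array(p["rewards"]) * discount ** np.arange(len(p["rewards"]))
                 for p in paths]
    return observations, actions, d_rewards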
    inputs=[observations_var, actions_var, d_rewards_var],
    outputs=grad)
f_update = theano.function(
    inputs=[eval_grad1, eval_grad2, eval_grad3, eval_grad4, eval_grad5],
    outputs=None,
    updates=sgd([eval_grad1, eval_grad2, eval_grad3, eval_grad4, eval_grad5],
                params, learning_rate=learning_rate))

alla = []
for i in range(10):
    if load_policy:
        policy.set_param_values(np.loadtxt('policy.txt'), trainable=True)
    avg_return = np.zeros(n_itr)
    # np.savetxt("policy_novar.txt", snap_policy.get_param_values(trainable=True))
    for j in range(n_itr):
        paths = parallel_sampler.sample_paths_on_trajectories(
            policy.get_param_values(), N, T, show_bar=False)
        # baseline.fit(paths)
        observations = [p["observations"] for p in paths]
        actions = [p["actions"] for p in paths]
        d_rewards = [p["rewards"] for p in paths]

        # replace raw rewards with discounted rewards r_t * discount**t
        temp = list()
        for x in d_rewards:
            z = list()
            t = 1
            for y in x:
                z.append(y * t)
                t *= discount
            temp.append(np.array(z))
        d_rewards = temp

        # accumulate the per-trajectory gradient estimates over the batch
        s_g = f_train(observations[0], actions[0], d_rewards[0])
        for ob, ac, rw in zip(observations[1:], actions[1:], d_rewards[1:]):
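# `sgd` above is assumed to behave like lasagne.updates.sgd: given a list of
# gradient expressions and the matching parameter list, it returns an ordered
# dictionary of updates applying one fixed-step gradient step. A minimal sketch
# (`sgd_sketch` is a hypothetical name; the sign depends on whether the
# gradients are of a loss to minimize or a return to maximize):
from collections import OrderedDict

def sgd_sketch(grads, params, learning_rate):
    updates = OrderedDict()
    for param, grad in zip(params, grads):
        updates[param] = param - learning_rate * grad
    return updates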
# restart the snapshot and current policies from the saved Swimmer parameters
snap_policy.set_param_values(np.loadtxt('policy_swimmer.txt'), trainable=True)
policy.set_param_values(np.loadtxt('policy_swimmer.txt'), trainable=True)

# per-run statistics
avg_return = list()
n_sub_iter = []
rewards_sub_iter = []
rewards_snapshot = []
importance_weights = []
variance_svrg = []
variance_sgd = []
# np.savetxt("policy_novar.txt", snap_policy.get_param_values(trainable=True))

j = 0
while j < s_tot - N:
    # sample the snapshot batch with the snapshot policy
    paths = parallel_sampler.sample_paths_on_trajectories(
        snap_policy.get_param_values(), N, T, show_bar=False)
    # baseline.fit(paths)
    paths = paths[:N]
    j += N
    observations = [p["observations"] for p in paths]
    actions = [p["actions"] for p in paths]
    d_rewards = [p["rewards"] for p in paths]

    # replace raw rewards with discounted rewards r_t * discount**t
    temp = list()
    for x in d_rewards:
        z = list()
        t = 1
        for y in x:
            z.append(y * t)
            t *= discount
        temp.append(np.array(z))
    d_rewards = temp
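# The importance_weights / variance_svrg statistics above belong to an
# SVRPG-style variance-reduced estimator. A hedged numpy sketch of that
# estimator (hypothetical helper; per-trajectory gradients are assumed to be
# flat arrays and `weights` the snapshot/current likelihood ratios):
def svrpg_estimate_sketch(grads_current, grads_snapshot, weights, snapshot_grad):
    # v = mu_snapshot + mean_i[ g_i(theta) - w_i * g_i(theta_snapshot) ]
    correction = np.mean(
        [g - w * gs for g, gs, w in zip(grads_current, grads_snapshot, weights)],
        axis=0)
    return snapshot_grad + correction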