Example #1
observations_var = env.observation_space.new_tensor_variable(
    'observations',
    # It should have 1 extra dimension since we want to represent a list of observations
    extra_dims=1
)
actions_var = env.action_space.new_tensor_variable(
    'actions',
    extra_dims=1
)
d_rewards_var = TT.vector('d_rewards')
importance_weights_var = TT.vector('importance_weight')

# policy.dist_info_sym returns a dictionary, whose values are symbolic expressions for quantities related to the
# distribution of the actions. For a Gaussian policy, it contains the mean and (log) standard deviation.
dist_info_vars = policy.dist_info_sym(observations_var)
snap_dist_info_vars = snap_policy.dist_info_sym(observations_var)

# dist is the policy's action distribution object (as in the later examples); it provides the likelihood utilities
dist = policy.distribution

surr = TT.sum(- dist.log_likelihood_sym_1traj_GPOMDP(actions_var, dist_info_vars) * d_rewards_var)

params = policy.get_params(trainable=True)
snap_params = snap_policy.get_params(trainable=True)

importance_weights = dist.likelihood_ratio_sym_1traj_GPOMDP(actions_var, snap_dist_info_vars, dist_info_vars)

grad = theano.grad(surr, params)

eval_grad1 = TT.matrix('eval_grad1', dtype=grad[0].dtype)
eval_grad2 = TT.vector('eval_grad2', dtype=grad[1].dtype)
eval_grad3 = TT.col('eval_grad3', dtype=grad[2].dtype)
eval_grad4 = TT.vector('eval_grad4', dtype=grad[3].dtype)
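
The snippet stops after declaring the gradient placeholders. Below is a minimal sketch, not part of the original, of how such placeholders and the importance-weight expression are typically compiled into Theano functions; f_importance_weights, f_update and the step size are names and values introduced here for illustration.

# Sketch (assumed continuation): evaluate the importance weights for a trajectory.
f_importance_weights = theano.function(
    inputs=[observations_var, actions_var],
    outputs=importance_weights,
    allow_input_downcast=True
)

# Sketch (assumed continuation): plain SGD step driven by gradient values
# computed and combined outside Theano and fed in through the placeholders.
eval_grads = [eval_grad1, eval_grad2, eval_grad3, eval_grad4]
f_update = theano.function(
    inputs=eval_grads,
    outputs=None,
    updates=[(p, p - 0.0005 * g) for p, g in zip(params, eval_grads)],
    allow_input_downcast=True
)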
Example #2
# Number of iterations
n_itr = 1000
# Set the discount factor for the problem
discount = 0.99
# Learning rate for the gradient update
learning_rate = 0.00005

observations_var = env.observation_space.new_tensor_variable(
    'observations',
    # It should have 1 extra dimension since we want to represent a list of observations
    extra_dims=1)
actions_var = env.action_space.new_tensor_variable('actions', extra_dims=1)
d_rewards_var = TT.vector('d_rewards')
# policy.dist_info_sym returns a dictionary, whose values are symbolic expressions for quantities related to the
# distribution of the actions. For a Gaussian policy, it contains the mean and (log) standard deviation.
dist_info_vars = policy.dist_info_sym(observations_var)

# dist is the policy's action distribution object (as in the later examples)
dist = policy.distribution

surr = TT.sum(
    -dist.log_likelihood_sym_1traj_GPOMDP(actions_var, dist_info_vars) *
    d_rewards_var)

params = policy.get_params(trainable=True)

grad = theano.grad(surr, params)

eval_grad1 = TT.matrix('eval_grad1', dtype=grad[0].dtype)
eval_grad2 = TT.vector('eval_grad2', dtype=grad[1].dtype)
eval_grad3 = TT.col('eval_grad3', dtype=grad[2].dtype)
eval_grad4 = TT.vector('eval_grad4', dtype=grad[3].dtype)
eval_grad5 = TT.vector('eval_grad5', dtype=grad[4].dtype)
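
As in Example #1, the snippet ends at the placeholder declarations. A minimal sketch, assumed rather than taken from the original, of feeding externally averaged gradients back through an adam update; f_compute_grad and f_update are hypothetical names.

from lasagne.updates import adam

# Sketch: per-trajectory policy gradient for given observations, actions and discounted rewards.
f_compute_grad = theano.function(
    inputs=[observations_var, actions_var, d_rewards_var],
    outputs=grad,
    allow_input_downcast=True
)

# Sketch: apply an adam step using gradient values supplied through the placeholders
# (e.g. after averaging the per-trajectory gradients in NumPy).
eval_grads = [eval_grad1, eval_grad2, eval_grad3, eval_grad4, eval_grad5]
f_update = theano.function(
    inputs=eval_grads,
    outputs=None,
    updates=adam(eval_grads, params, learning_rate=learning_rate),
    allow_input_downcast=True
)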
Example #3
# Create a Theano variable for storing the observations
# We could have simply written `observations_var = TT.matrix('observations')` instead for this example. However,
# doing it in a slightly more abstract way allows us to delegate to the environment for handling the correct data
# type for the variable. For instance, for an environment with discrete observations, we might want to use integer
# types if the observations are represented as one-hot vectors.
observations_var = env.observation_space.new_tensor_variable(
    'observations',
    # It should have 1 extra dimension since we want to represent a list of observations
    extra_dims=1
)
actions_var = env.action_space.new_tensor_variable(
    'actions',
    extra_dims=1
)
advantages_var = TT.vector('advantages')

# policy.dist_info_sym returns a dictionary, whose values are symbolic expressions for quantities related to the
# distribution of the actions. For a Gaussian policy, it contains the mean and (log) standard deviation.
dist_info_vars = policy.dist_info_sym(observations_var)

# policy.distribution returns a distribution object under rllab.distributions. It contains many utilities for computing
# distribution-related quantities, given the computed dist_info_vars. Below we use dist.log_likelihood_sym to compute
# the symbolic log-likelihood. For this example, the corresponding distribution is an instance of the class
# rllab.distributions.DiagonalGaussian
dist = policy.distribution

# Note that we negate the objective, since most optimizers assume a
# minimization problem
surr = - TT.mean(dist.log_likelihood_sym(actions_var, dist_info_vars) * advantages_var)

# Get the list of trainable parameters.
params = policy.get_params(trainable=True)
grads = theano.grad(surr, params)
def run_task(*_):
    # normalize() makes sure that the actions for the environment lie
    # within the range [-1, 1] (only works for environments with continuous actions)
    env = normalize(
        GymEnv(env_name="LunarLanderContinuous-v2", force_reset=True))
    # Initialize a neural network policy with two hidden layers of 64 units each
    policy = GaussianMLPPolicy(env.spec, hidden_sizes=(64, 64))
    # Initialize a linear baseline estimator using default hand-crafted features
    baseline = LinearFeatureBaseline(env.spec)

    # We will collect 3 trajectories per iteration
    N = 3
    # Each trajectory will have at most 400 time steps
    T = 400
    # Number of iterations
    n_itr = 1000
    # Set the discount factor for the problem
    discount = 0.99
    # Learning rate for the gradient update
    learning_rate = 0.001

    # Construct the computation graph

    # Create a Theano variable for storing the observations
    # We could have simply written `observations_var = TT.matrix('observations')` instead for this example. However,
    # doing it in a slightly more abstract way allows us to delegate to the environment for handling the correct data
    # type for the variable. For instance, for an environment with discrete observations, we might want to use integer
    # types if the observations are represented as one-hot vectors.
    observations_var = env.observation_space.new_tensor_variable(
        'observations',
        # It should have 1 extra dimension since we want to represent a list of observations
        extra_dims=1)
    actions_var = env.action_space.new_tensor_variable('actions', extra_dims=1)
    advantages_var = TT.vector('advantages')

    # policy.dist_info_sym returns a dictionary, whose values are symbolic expressions for quantities related to the
    # distribution of the actions. For a Gaussian policy, it contains the mean and (log) standard deviation.
    dist_info_vars = policy.dist_info_sym(observations_var)

    # policy.distribution returns a distribution object under rllab.distributions. It contains many utilities for computing
    # distribution-related quantities, given the computed dist_info_vars. Below we use dist.log_likelihood_sym to compute
    # the symbolic log-likelihood. For this example, the corresponding distribution is an instance of the class
    # rllab.distributions.DiagonalGaussian
    dist = policy.distribution

    # Note that we negate the objective, since most optimizers assume a
    # minimization problem
    surr = -TT.mean(
        dist.log_likelihood_sym(actions_var, dist_info_vars) * advantages_var)

    # Get the list of trainable parameters.
    params = policy.get_params(trainable=True)
    grads = theano.grad(surr, params)

    f_train = theano.function(
        inputs=[observations_var, actions_var, advantages_var],
        outputs=None,
        updates=adam(grads, params, learning_rate=learning_rate),
        allow_input_downcast=True)

    for epoch in range(n_itr):
        logger.push_prefix('epoch #%d | ' % epoch)
        logger.log("Training started")
        paths = []

        for _ in range(N):
            observations = []
            actions = []
            rewards = []

            observation = env.reset()

            for _ in range(T):
                # policy.get_action() returns a pair of values. The second one is a dictionary, whose values contain
                # sufficient statistics for the action distribution. It should at least contain entries that would be
                # returned by calling policy.dist_info(), which is the non-symbolic analog of policy.dist_info_sym().
                # Storing these statistics is useful, e.g., when forming importance sampling ratios. In our case it is
                # not needed.
                action, _ = policy.get_action(observation)
                # Recall that the last entry of the tuple stores diagnostic information about the environment. In our
                # case it is not needed.
                next_observation, reward, terminal, _ = env.step(action)
                observations.append(observation)
                actions.append(action)
                rewards.append(reward)
                observation = next_observation
                if terminal:
                    # Finish rollout if terminal state reached
                    break

            # We need to compute the empirical return for each time step along the
            # trajectory
            path = dict(
                observations=np.array(observations),
                actions=np.array(actions),
                rewards=np.array(rewards),
            )
            path_baseline = baseline.predict(path)
            advantages = []
            returns = []
            return_so_far = 0
            for t in range(len(rewards) - 1, -1, -1):
                return_so_far = rewards[t] + discount * return_so_far
                returns.append(return_so_far)
                advantage = return_so_far - path_baseline[t]
                advantages.append(advantage)
            # The advantages are stored backwards in time, so we need to reverse them
            advantages = np.array(advantages[::-1])
            # And we need to do the same thing for the list of returns
            returns = np.array(returns[::-1])

            advantages = (advantages -
                          np.mean(advantages)) / (np.std(advantages) + 1e-8)

            path["advantages"] = advantages
            path["returns"] = returns

            paths.append(path)

        baseline.fit(paths)

        observations = np.concatenate([p["observations"] for p in paths])
        actions = np.concatenate([p["actions"] for p in paths])
        advantages = np.concatenate([p["advantages"] for p in paths])

        f_train(observations, actions, advantages)
        returns_to_check = [sum(p["rewards"]) for p in paths]
        print('Average Return:', np.mean(returns_to_check))

        ############################################################################
        logger.log("Training finished")
        logger.save_itr_params(epoch, params)

        logger.record_tabular('Epoch', epoch)
        logger.record_tabular('Steps', epoch * N * T)
        logger.record_tabular('AverageReturn', np.mean(returns_to_check))
        logger.record_tabular('StdReturn', np.std(returns_to_check))
        logger.record_tabular('MaxReturn', np.max(returns_to_check))
        logger.record_tabular('MinReturn', np.min(returns_to_check))
        logger.dump_tabular(with_prefix=False)
        logger.pop_prefix()
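
run_task above is a self-contained training loop. A minimal launch sketch, assuming the usual rllab experiment runner; the argument values are illustrative.

from rllab.misc.instrument import run_experiment_lite

run_experiment_lite(
    run_task,
    n_parallel=1,          # number of sampler workers
    snapshot_mode="last",  # keep only the last iteration's snapshot
    seed=1,
)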
Example #5
def doit(mode):
    from rllab.envs.box2d.cartpole_env import CartpoleEnv
    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
    from rllab.baselines.zero_baseline import ZeroBaseline
    from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
    from rllab.envs.normalized_env import normalize
    import numpy as np
    import theano
    import theano.tensor as TT
    from lasagne.updates import adam

    # normalize() makes sure that the actions for the environment lie
    # within the range [-1, 1] (only works for environments with continuous actions)
    env = normalize(CartpoleEnv())
    # Initialize a neural network policy with a single hidden layer of 8 hidden units
    policy = GaussianMLPPolicy(env.spec, hidden_sizes=(8,))
    # Initialize a linear baseline estimator using default hand-crafted features
    if "linbaseline" in mode:
        print('linear baseline')
        baseline = LinearFeatureBaseline(env.spec)
    elif "vanilla" in mode:
        print("zero baseline")
        baseline = ZeroBaseline(env.spec)
    elif mode == "batchavg":
        print('batch average baseline')
        # use a zero baseline but subtract the mean of the discounted returns (see below)
        baseline = ZeroBaseline(env.spec)

    if "_ztrans" in mode:
        print('z transform advantages')
    else:
        print('no z transform')


    # We will collect 50 trajectories per iteration
    N = 50
    # Each trajectory will have at most 50 time steps
    T = 50
    # Number of iterations
    n_itr = 50
    # Set the discount factor for the problem
    discount = 0.99
    # Learning rate for the gradient update
    learning_rate = 0.1

    # Construct the computation graph

    # Create a Theano variable for storing the observations
    # We could have simply written `observations_var = TT.matrix('observations')` instead for this example. However,
    # doing it in a slightly more abstract way allows us to delegate to the environment for handling the correct data
    # type for the variable. For instance, for an environment with discrete observations, we might want to use integer
    # types if the observations are represented as one-hot vectors.
    observations_var = env.observation_space.new_tensor_variable(
        'observations',
        # It should have 1 extra dimension since we want to represent a list of observations
        extra_dims=1
    )
    actions_var = env.action_space.new_tensor_variable(
        'actions',
        extra_dims=1
    )
    advantages_var = TT.vector('advantages')

    # policy.dist_info_sym returns a dictionary, whose values are symbolic expressions for quantities related to the
    # distribution of the actions. For a Gaussian policy, it contains the mean and (log) standard deviation.
    dist_info_vars = policy.dist_info_sym(observations_var)

    # policy.distribution returns a distribution object under rllab.distributions. It contains many utilities for computing
    # distribution-related quantities, given the computed dist_info_vars. Below we use dist.log_likelihood_sym to compute
    # the symbolic log-likelihood. For this example, the corresponding distribution is an instance of the class
    # rllab.distributions.DiagonalGaussian
    dist = policy.distribution

    # Note that we negate the objective, since most optimizers assume a
    # minimization problem
    surr = - TT.mean(dist.log_likelihood_sym(actions_var, dist_info_vars) * advantages_var)

    # Get the list of trainable parameters.
    params = policy.get_params(trainable=True)
    grads = theano.grad(surr, params)

    f_train = theano.function(
        inputs=[observations_var, actions_var, advantages_var],
        outputs=None,
        updates=adam(grads, params, learning_rate=learning_rate),
        allow_input_downcast=True
    )

    results = []
    for _ in range(n_itr):

        paths = []

        for _ in range(N):
            observations = []
            actions = []
            rewards = []

            observation = env.reset()

            for _ in range(T):
                # policy.get_action() returns a pair of values. The second one is a dictionary, whose values contain
                # sufficient statistics for the action distribution. It should at least contain entries that would be
                # returned by calling policy.dist_info(), which is the non-symbolic analog of policy.dist_info_sym().
                # Storing these statistics is useful, e.g., when forming importance sampling ratios. In our case it is
                # not needed.
                action, _ = policy.get_action(observation)
                # Recall that the last entry of the tuple stores diagnostic information about the environment. In our
                # case it is not needed.
                next_observation, reward, terminal, _ = env.step(action)
                observations.append(observation)
                actions.append(action)
                rewards.append(reward)
                observation = next_observation
                if terminal:
                    # Finish rollout if terminal state reached
                    break

            # We need to compute the empirical return for each time step along the
            # trajectory
            path = dict(
                observations=np.array(observations),
                actions=np.array(actions),
                rewards=np.array(rewards),
            )
            path_baseline = baseline.predict(path)
            advantages = []
            returns = []
            return_so_far = 0
            for t in range(len(rewards) - 1, -1, -1):
                return_so_far = rewards[t] + discount * return_so_far
                returns.append(return_so_far)
                advantage = return_so_far - path_baseline[t]
                advantages.append(advantage)
            # The advantages are stored backwards in time, so we need to reverse them
            advantages = np.array(advantages[::-1])
            # And we need to do the same thing for the list of returns
            returns = np.array(returns[::-1])

            if "_ztrans" in mode:
                advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8)


            path["advantages"] = advantages
            path["returns"] = returns

            paths.append(path)

        baseline.fit(paths)

        observations = np.concatenate([p["observations"] for p in paths])
        actions = np.concatenate([p["actions"] for p in paths])
        advantages = np.concatenate([p["advantages"] for p in paths])


        if mode == 'batchavg':
            # in this case `advantages` up to here are just our good old returns, without baseline or z transformation.
            # now we subtract their mean across all episodes.
            advantages = advantages - np.mean(advantages)


        f_train(observations, actions, advantages)
        avgr = np.mean([sum(p["rewards"]) for p in paths])
        print('Average Return:', avgr)
        results.append(avgr)
    return results
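
A short usage sketch (not in the original): compare the baseline variants by calling doit() with different mode strings; each call returns the list of average returns per iteration.

results_vanilla = doit("vanilla")
results_linear = doit("linbaseline_ztrans")
results_batchavg = doit("batchavg")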
Example #6
class Bw_Trans_Model:
    def __init__(self, inputSize, outputSize, env, v, learning_rate, batchsize,
                 which_agent, x_index, y_index, num_fc_layers, depth_fc_layers,
                 print_minimal):

        #init vars
        #self.sess = sess
        self.batchsize = batchsize
        self.which_agent = which_agent
        self.x_index = x_index
        self.y_index = y_index
        self.inputSize = inputSize
        self.outputSize = outputSize
        self.print_minimal = print_minimal

        LOW = -1000000
        HIGH = 1000000
        self.act_dim = env.spec.action_space.flat_dim
        self.obs_dim = env.spec.observation_space.flat_dim
        obs_to_act_spec = env.spec
        obsact_to_obs_spec = EnvSpec(observation_space=Box(
            LOW, HIGH, shape=(self.obs_dim + self.act_dim, )),
                                     action_space=Box(LOW,
                                                      HIGH,
                                                      shape=(self.obs_dim, )))

        # TODO: decide whether to learn the std for the backwards policy.
        self.bw_act_pol = GaussianMLPPolicy(
            env_spec=obs_to_act_spec,
            hidden_sizes=(64, 64),
            learn_std=v['bw_variance_learn'],
        )

        self.bw_obs_pol = GaussianMLPPolicy(
            env_spec=obsact_to_obs_spec,
            hidden_sizes=(v['bw_model_hidden_size'],
                          v['bw_model_hidden_size']),
            learn_std=v['bw_variance_learn'],
            hidden_nonlinearity=NL.rectify,
        )

        self.obs_in = TT.matrix('obs_in')
        self.obsact_in = TT.matrix('obsact_in')
        self.act_out = TT.matrix('act_out')
        self.diff_out = TT.matrix('diff_out')

        bw_learning_rate = v['bw_learning_rate']
        self.bw_act_dist = self.bw_act_pol.dist_info_sym(self.obs_in)
        self.bw_obs_dist = self.bw_obs_pol.dist_info_sym(self.obsact_in)
        self.bw_act_loss = -TT.sum(
            self.bw_act_pol.distribution.log_likelihood_sym(
                self.act_out, self.bw_act_dist))
        bw_obs_loss = -TT.sum(
            self.bw_obs_pol.distribution.log_likelihood_sym(
                self.diff_out, self.bw_obs_dist))

        bw_act_params = self.bw_act_pol.get_params_internal()
        bw_obs_params = self.bw_obs_pol.get_params_internal()
        #bw_params = bw_act_params + bw_obs_params
        bw_s_to_a_update = lasagne.updates.adam(self.bw_act_loss,
                                                bw_act_params,
                                                learning_rate=bw_learning_rate)
        bw_sa_to_s_update = lasagne.updates.adam(
            bw_obs_loss, bw_obs_params, learning_rate=bw_learning_rate)

        self.bw_act_train = theano.function([self.obs_in, self.act_out],
                                            self.bw_act_loss,
                                            updates=bw_s_to_a_update,
                                            allow_input_downcast=True)
        self.bw_obs_train = theano.function([self.obsact_in, self.diff_out],
                                            bw_obs_loss,
                                            updates=bw_sa_to_s_update,
                                            allow_input_downcast=True)

    def train(self, dataX, dataZ, dataX_new, dataZ_new, nEpoch, save_dir,
              fraction_use_new):

        #init vars
        start = time.time()
        training_loss_list = []
        nData_old = dataX.shape[0]
        num_new_pts = dataX_new.shape[0]

        #how much of new data to use per batch
        if (num_new_pts < (self.batchsize * fraction_use_new)):
            batchsize_new_pts = num_new_pts  #use all of the new ones
        else:
            batchsize_new_pts = int(self.batchsize * fraction_use_new)

        #how much of old data to use per batch
        batchsize_old_pts = int(self.batchsize - batchsize_new_pts)

        #training loop
        for i in range(nEpoch):

            #reset to 0
            avg_loss = 0
            num_batches = 0

            if (batchsize_old_pts > 0):
                print("nothing is going on")

            #train completely from new set
            else:
                for batch in range(
                        int(math.floor(num_new_pts / batchsize_new_pts))):

                    #walk through the shuffled new data
                    dataX_batch = dataX_new[batch *
                                            batchsize_new_pts:(batch + 1) *
                                            batchsize_new_pts, :]
                    dataZ_batch = dataZ_new[batch *
                                            batchsize_new_pts:(batch + 1) *
                                            batchsize_new_pts, :]

                    data_x = dataX_batch[:, 0:self.obs_dim]
                    data_y = dataX_batch[:, self.obs_dim:]

                    loss = self.bw_act_train(data_x, data_y)
                    bw_obs_losses = self.bw_obs_train(dataX_batch, dataZ_batch)

                    training_loss_list.append(loss)
                    avg_loss += bw_obs_losses  #[0]
                    num_batches += 1

                #shuffle new dataset after an epoch (if training only on it)
                p = npr.permutation(dataX_new.shape[0])
                dataX_new = dataX_new[p]
                dataZ_new = dataZ_new[p]

            #save losses after an epoch
            np.save(save_dir + '/training_losses.npy', training_loss_list)
            if (not (self.print_minimal)):
                if ((i % 10) == 0):
                    print("\n=== Epoch {} ===".format(i))
                    print("loss: ", avg_loss / num_batches)

        if (not (self.print_minimal)):
            print("Training set size: ", (nData_old + dataX_new.shape[0]))
            print("Training duration: {:0.2f} s".format(time.time() - start))

        #done
        return (avg_loss / num_batches)  #, old_loss, new_loss

    #multistep prediction using the learned dynamics model at each step
    def do_forward_sim(self, forwardsim_x_true, num_step, many_in_parallel,
                       env_inp, which_agent, mean_x, mean_y, mean_z, std_x,
                       std_y, std_z):

        #init vars
        state_list = []
        action_list = []
        if (many_in_parallel):
            #init vars
            print("Future work..")
        else:
            curr_state = np.copy(
                forwardsim_x_true)  #curr state is of dim NN input
            for i in range(num_step):
                curr_state_preprocessed = curr_state - mean_x
                curr_state_preprocessed = np.nan_to_num(
                    curr_state_preprocessed / std_x)
                action = self.bw_act_pol.get_action(curr_state_preprocessed)[0]
                action_ = action * std_y + mean_y
                state_difference = self.bw_obs_pol.get_action(
                    np.concatenate((curr_state_preprocessed, action)))[0]
                state_differences = (state_difference * std_z) + mean_z
                next_state = curr_state + state_differences
                #copy the state info
                curr_state = np.copy(next_state)
                state_list.append(np.copy(curr_state))
                action_list.append(np.copy(action_))

        return state_list, action_list
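
do_forward_sim consumes per-dimension normalization statistics (mean_x, std_x, and so on). A minimal sketch of how such statistics can be computed, under the assumption that they are zero-mean/unit-std constants over the training data (compare zero_mean_unit_std in Example #7); this is an illustrative helper, not the original one.

import numpy as np

def zero_mean_unit_std_sketch(data):
    # Normalize each column to zero mean and unit standard deviation and
    # return the statistics needed to undo the transformation later.
    mean = np.mean(data, axis=0)
    std = np.std(data, axis=0)
    normalized = np.nan_to_num((data - mean) / std)
    return normalized, mean, std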
Example #7
def run_task(v):
    env, _ = create_env(v["which_agent"])
    fw_learning_rate = v['fw_learning_rate']  # 0.0005!

    yaml_path = os.path.abspath('yaml_files/' + v['yaml_file'] + '.yaml')
    assert (os.path.exists(yaml_path))
    with open(yaml_path, 'r') as f:
        params = yaml.load(f, Loader=yaml.SafeLoader)
    num_fc_layers = params['dyn_model']['num_fc_layers']
    depth_fc_layers = params['dyn_model']['depth_fc_layers']
    batchsize = params['dyn_model']['batchsize']
    lr = params['dyn_model']['lr']
    print_minimal = v['print_minimal']
    nEpoch = params['dyn_model']['nEpoch']
    save_dir = os.path.join(args.save_dir, v['exp_name'])
    inputSize = env.spec.action_space.flat_dim + env.spec.observation_space.flat_dim
    outputSize = env.spec.observation_space.flat_dim

    #Initialize the forward policy
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 64))
    #learn_std=False, #v['learn_std'],
    #adaptive_std=False, #v['adaptive_std'],
    #output_gain=1, #v['output_gain'],
    #init_std=1) #v['polic)
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    # Update function for the forward policy (imitation learning loss!)
    fwd_obs = TT.matrix('fwd_obs')
    fwd_act_out = TT.matrix('act_out')
    policy_dist = policy.dist_info_sym(fwd_obs)
    fw_loss = -TT.sum(
        policy.distribution.log_likelihood_sym(fwd_act_out, policy_dist))
    fw_params = policy.get_params_internal()
    fw_update = lasagne.updates.adam(fw_loss,
                                     fw_params,
                                     learning_rate=fw_learning_rate)
    fw_func = theano.function([fwd_obs, fwd_act_out],
                              fw_loss,
                              updates=fw_update,
                              allow_input_downcast=True)
    log_dir = v['yaml_file']
    print('Logging Tensorboard to: %s' % log_dir)
    hist_logger = hist_logging(log_dir)

    optimizer_params = dict(base_eps=1e-5)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
        os.makedirs(save_dir + '/losses')
        os.makedirs(save_dir + '/models')
        os.makedirs(save_dir + '/saved_forwardsim')
        os.makedirs(save_dir + '/saved_trajfollow')
        os.makedirs(save_dir + '/training_data')

    x_index, y_index, z_index, yaw_index,\
    joint1_index, joint2_index, frontleg_index,\
    frontshin_index, frontfoot_index, xvel_index, orientation_index = get_indices(v['which_agent'])
    dyn_model = Bw_Trans_Model(inputSize, outputSize, env, v, lr, batchsize,
                               v['which_agent'], x_index, y_index,
                               num_fc_layers, depth_fc_layers, print_minimal)

    for outer_iter in range(1, v['outer_iters']):

        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=v["batch_size"],
            max_path_length=v["steps_per_rollout"],
            n_itr=v["num_trpo_iters"],
            discount=0.995,
            optimizer=v["ConjugateGradientOptimizer"](
                hvp_approach=v["FiniteDifferenceHvp"](**optimizer_params)),
            step_size=0.05,
            plot_true=True)
        all_paths = algo.train()

        # Collect the trajectories; using the trajectories that lead to high-value states,
        # learn a backwards model.
        observations_list = []
        actions_list = []
        rewards_list = []
        returns_list = []
        for indexing in all_paths:
            for paths in indexing:
                observations = []
                actions = []
                returns = []
                reward_for_rollout = 0
                for i_ in range(len(paths['observations'])):
                    # Since we are building a backwards model from trajectories,
                    # reverse the trajectories.
                    index_ = len(paths['observations']) - i_ - 1
                    observations.append(paths['observations'][index_])
                    actions.append(paths['actions'][index_])
                    returns.append(paths['returns'][index_])
                    reward_for_rollout += paths['rewards'][index_]
                    #if something_ == 1:
                    #    actions_bw.append(path['actions'][::-1])
                    #    observations_bw.append(path['observations'][::-1])
                observations_list.append(observations)
                actions_list.append(actions)
                rewards_list.append(reward_for_rollout)
                returns_list.append(returns)

        hist_logger.log_scalar(save_dir,
                               np.sum(rewards_list) / len(rewards_list),
                               outer_iter * v["num_trpo_iters"])
        selected_observations_list = []
        selected_actions_list = []
        selected_returns_list = []

        # Figure out how to build the backwards model.
        # Conjecture 1
        # ------- Take a quantile sample of the trajectories that receive the highest cumulative rewards.

        number_of_trajectories = int(
            np.floor(v['top_k_trajectories'] * len(rewards_list) / 100))
        rewards_list_np = np.asarray(rewards_list)
        trajectory_indices = rewards_list_np.argsort(
        )[-number_of_trajectories:][::-1]
        for index_ in range(len(trajectory_indices)):
            selected_observations_list.append(
                observations_list[trajectory_indices[index_]])
            selected_actions_list.append(
                actions_list[trajectory_indices[index_]])

        selected_observations_list_for_state_selection = []
        number_of_trajectories = int(
            np.floor(v['top_k_trajectories_state_selection'] *
                     len(rewards_list) / 100))
        rewards_list_np = np.asarray(rewards_list)
        trajectory_indices = rewards_list_np.argsort(
        )[-number_of_trajectories:][::-1]
        for index_ in range(len(trajectory_indices)):
            selected_observations_list_for_state_selection.append(
                observations_list[trajectory_indices[index_]])
            selected_returns_list.append(
                returns_list[trajectory_indices[index_]])

        # Figure out where to start the backwards model from.
        # Conjecture 1
        # ------ Take a quantile sample of high-value states and start the backwards model from them,
        # which amounts to keeping a non-parametric buffer of high-value states.

        if v['use_good_trajectories'] == 1:
            returns_list = selected_returns_list
            observations_list = selected_observations_list_for_state_selection

        flatten_ret_list = np.asarray(returns_list).flatten()
        flatten_obs_list = np.vstack(np.asarray(observations_list))
        number_of_bw_samples = int(
            np.floor(v['top_k_bw_samples'] * len(flatten_ret_list) / 100))
        samples_indices = flatten_ret_list.argsort(
        )[-number_of_bw_samples:][::-1]
        bw_samples = []
        for bw_index in range(len(samples_indices)):
            bw_samples.append(flatten_obs_list[samples_indices[bw_index]])

        #Not all parts of the state are actually used.
        states = from_observation_to_usablestate(selected_observations_list,
                                                 v["which_agent"], False)
        controls = selected_actions_list
        dataX, dataY = generate_training_data_inputs(states, controls)
        states = np.asarray(states)
        dataZ = generate_training_data_outputs(states, v['which_agent'])

        #every component (i.e. x position) should become mean 0, std 1
        dataX, mean_x, std_x = zero_mean_unit_std(dataX)
        dataY, mean_y, std_y = zero_mean_unit_std(dataY)
        dataZ, mean_z, std_z = zero_mean_unit_std(dataZ)

        ## concatenate state and action, to be used for training dynamics
        inputs = np.concatenate((dataX, dataY), axis=1)
        outputs = np.copy(dataZ)
        assert inputs.shape[0] == outputs.shape[0]

        if v['num_imagination_steps'] == 10:
            nEpoch = 20
        elif v['num_imagination_steps'] == 50:
            nEpoch = 20
        elif v['num_imagination_steps'] == 100:
            nEpoch = 30
        else:
            nEpoch = 20

        nEpoch = v['nEpoch']

        training_loss = dyn_model.train(inputs, outputs, inputs, outputs,
                                        nEpoch, save_dir, 1)
        print("Training Loss for Backwards model", training_loss)

        if v['running_baseline'] == False:
            for goal_ind in range(min(v['fw_iter'], len(bw_samples))):
                # Train the backwards model: pick an initial state and perform rollouts from the
                # backwards model. The start state is taken from bw_samples, the buffer of
                # high-value states selected above.
                forwardsim_x_true = bw_samples[goal_ind]
                state_list, action_list = dyn_model.do_forward_sim(
                    forwardsim_x_true, v['num_imagination_steps'], False, env,
                    v['which_agent'], mean_x, mean_y, mean_z, std_x, std_y,
                    std_z)

                #Incorporate the backwards trace into model based system.
                fw_func(np.vstack(state_list), np.vstack(action_list))
                #print("Immitation Learning loss", loss)
        else:
            print('running TRPO baseline')
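
The top-k trajectory/state selection above is written out twice with different percentages. Factored out as a hypothetical helper (not in the original code), the logic is:

import numpy as np

def top_k_indices(scores, top_k_percent):
    # Indices of the highest-scoring `top_k_percent` percent of entries, best first.
    k = max(1, int(np.floor(top_k_percent * len(scores) / 100)))
    return np.asarray(scores).argsort()[-k:][::-1]

# e.g. trajectory_indices = top_k_indices(rewards_list, v['top_k_trajectories'])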