def __init__(self, params): """Initializes class instance. Argument: params (DotMap): A DotMap containing the following: .sim_cfg: .env (gym.env): Environment for this experiment .task_hor (int): Task horizon .stochastic (bool): (optional) If True, agent adds noise to its actions. Must provide noise_std (see below). Defaults to False. .noise_std (float): for stochastic agents, noise of the form N(0, noise_std^2I) will be added. .exp_cfg: .ntrain_iters (int): Number of training iterations to be performed. .nrollouts_per_iter (int): (optional) Number of rollouts done between training iterations. Defaults to 1. .ninit_rollouts (int): (optional) Number of initial rollouts. Defaults to 1. .policy (controller): Policy that will be trained. .log_cfg: .logdir (str): Parent of directory path where experiment data will be saved. Experiment will be saved in logdir/<date+time of experiment start> .nrecord (int): (optional) Number of rollouts to record for every iteration. Defaults to 0. .neval (int): (optional) Number of rollouts for performance evaluation. Defaults to 1. """ self.env = get_required_argument(params.sim_cfg, "env", "Must provide environment.") self.task_hor = get_required_argument(params.sim_cfg, "task_hor", "Must provide task horizon.") self._params = params params.sim_cfg.misc = copy.copy(params) if params.sim_cfg.get("stochastic", False): self.agent = Agent(DotMap( env=self.env, noisy_actions=True, noise_stddev=get_required_argument( params.sim_cfg, "noise_std", "Must provide noise standard deviation in the case of a stochastic environment." ), params=params )) else: self.agent = Agent(DotMap(env=self.env, noisy_actions=False, params=params)) self.ntrain_iters = get_required_argument( params.exp_cfg, "ntrain_iters", "Must provide number of training iterations." ) self.nrollouts_per_iter = params.exp_cfg.get("nrollouts_per_iter", 1) self.ninit_rollouts = params.exp_cfg.get("ninit_rollouts", 1) self.policy = get_required_argument(params.exp_cfg, "policy", "Must provide a policy.") self.logdir = os.path.join( get_required_argument(params.log_cfg, "logdir", "Must provide log parent directory."), strftime("%Y-%m-%d--%H:%M:%S", localtime()) ) logger.set_file_handler(path=self.logdir) logger.info('Starting the experiments') self.nrecord = params.log_cfg.get("nrecord", 0) self.neval = params.log_cfg.get("neval", 1)
def train(self, inputs, targets, *args, **kwargs):
    """Optimizes the parameters of the internal GP model.

    Arguments:
        inputs: (np.ndarray) An array of inputs.
        targets: (np.ndarray) An array of targets.

    Returns: None.
    """
    # Shuffle the data, then take the first num_inducing_points rows as the
    # initial inducing inputs (zero-padded if there is too little data).
    perm = np.random.permutation(inputs.shape[0])
    inputs, targets = inputs[perm], targets[perm]

    Z = np.copy(inputs[:self.num_inducing_points])
    if Z.shape[0] < self.num_inducing_points:
        Z = np.concatenate([
            Z, np.zeros([self.num_inducing_points - Z.shape[0], Z.shape[1]])
        ])
    self.model.X = inputs
    self.model.Y = targets
    self.model.feature.Z = Z

    with self.sess.as_default():
        self.model.compile()
        logger.info("Optimizing model...")
        gpflow.train.ScipyOptimizer().minimize(self.model)
        logger.info("Done.")
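# Hedged sketch: fitting the sparse-GP wrapper above on toy data. The shapes
# are illustrative; if fewer rows than num_inducing_points are supplied, the
# zero-padding branch in train() runs.
def _example_fit_gp(gp_wrapper):
    import numpy as np
    X = np.random.randn(256, 4)   # toy inputs
    Y = np.sin(X[:, :1])          # toy 1-D targets
    gp_wrapper.train(X, Y)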
def build_loss(self):
    # Build the start_state placeholder and the whitening variables.
    self._build_ph()
    self._tensor, self._update_operator = {}, {}

    # Construct the input to the forward network: normalize the state input.
    self._tensor['normalized_start_state'] = (
        self._input_ph['start_state'] -
        self._whitening_operator['state_mean']
    ) / self._whitening_operator['state_std']
    self._tensor['net_input'] = self._tensor['normalized_start_state']

    # The output action of the policy network.
    self._tensor['action'] = self._MLP(self._tensor['net_input'])

    self._input_ph['target_action'] = tf.placeholder(
        tf.float32, [None, self._action_size], name='target_action')

    # Mean-squared behavior-cloning loss between target and predicted actions.
    self._update_operator['loss'] = tf.reduce_mean(
        tf.square(self._input_ph['target_action'] - self._tensor['action']))

    self._update_operator['update_op'] = tf.train.AdamOptimizer(
        learning_rate=self.args.policy_lr,
    ).minimize(self._update_operator['loss'])
    logger.info("policy training learning rate: {}".format(self.args.policy_lr))
def obtain_solution(self, init_mean, init_var, per, dU, obs=None): """Optimizes the cost function using the provided initial candidate distribution Arguments: init_mean (np.ndarray): The mean of the initial candidate distribution. init_var (np.ndarray): The variance of the initial candidate distribution. """ if self.tf_compatible: sol, solvar = self.tf_sess.run([self.mean, self.var], feed_dict={ self.init_mean: init_mean, self.init_var: init_var }) else: assert self._params.il_cfg.use_gt_dynamics mean, var, t = init_mean, init_var, 0 X = stats.truncnorm(-2, 2, loc=np.zeros_like(mean), scale=np.ones_like(mean)) cfg = { 'plan_hor': self._params.opt_cfg.plan_hor, 'dU': self._params.env.action_space.shape[0] } while (t < self.max_iters) and np.max(var) > self.epsilon: lb_dist, ub_dist = mean - self.lb, self.ub - mean constrained_var = np.minimum( np.minimum(np.square(lb_dist / 2), np.square(ub_dist / 2)), var) samples = X.rvs(size=[self.popsize, self.sol_dim]) * np.sqrt( constrained_var) + mean costs = self._gt_compile_cost( obs, samples, cfg, self._dynamics, self._dynamics._numpy_reward_function) costs = np.reshape(costs, [-1]) elites = samples[np.argsort(costs)][:self.num_elites] new_mean = np.mean(elites, axis=0) new_var = np.var(elites, axis=0) mean = self.alpha * mean + (1 - self.alpha) * new_mean var = self.alpha * var + (1 - self.alpha) * new_var logger.info('variance of elite: {}'.format(np.var(elites))) logger.info('Mean perforamnce: {}'.format( np.mean(costs[np.argsort(costs)][:self.num_elites]))) t += 1 sol, solvar = mean, var sol = np.reshape(sol, [-1]) # prev_sol is going to be used next timestep prev_sol = self.update_prev_sol(per, dU, sol) return sol, prev_sol
def optimize_weights(self, data_dict, training_keys):
    # data_dict is nested as data_dict[key][set_id][num_data]; the set_id
    # dimension looks odd -- double-check where it is fed.
    test_set_id = np.arange(len(data_dict['start_state']))
    num_test_data = int(len(test_set_id) * self.args.pct_testset)
    self._npr.shuffle(test_set_id)

    # Split the shuffled data into a held-out test set and a training set.
    test_set = {key: data_dict[key][test_set_id][:num_test_data]
                for key in training_keys}
    train_set = {key: data_dict[key][test_set_id][num_test_data:]
                 for key in training_keys}

    test_error = old_test_error = np.inf

    # Supervised training of the behavior policy (behavior cloning).
    for epoch in range(self.args.policy_epochs):
        total_batch_len = len(train_set['start_state'])
        total_batch_inds = np.arange(total_batch_len)
        self._npr.shuffle(total_batch_inds)
        num_minibatch = max(total_batch_len // self.args.minibatch_size, 1)
        train_error = []

        for batch_id in range(num_minibatch):
            start = batch_id * self.args.minibatch_size
            end = min(start + self.args.minibatch_size, total_batch_len)
            batch_inds = total_batch_inds[start:end]

            # batch_inds index into the shuffled training split, so feed from
            # train_set; feeding from the full data_dict would mix in test rows.
            feed_dict = {self._input_ph[key]: train_set[key][batch_inds]
                         for key in training_keys}
            error, _ = self._session.run(
                [self._update_operator['loss'],
                 self._update_operator['update_op']],
                feed_dict=feed_dict
            )
            train_error.append(error)

        # Evaluate the held-out test error.
        feed_dict = {self._input_ph[key]: test_set[key]
                     for key in training_keys}
        test_error = self._session.run(self._update_operator['loss'],
                                       feed_dict=feed_dict)

        logger.info('Epoch %d; Train Error: %.6f; Test Error: %.6f' %
                    (epoch, np.mean(train_error), test_error))
        if test_error > old_test_error and epoch % 5 == 0:
            # TODO: use a patience counter instead of this heuristic.
            logger.info('Early stopping')
            break
        else:
            old_test_error = test_error
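# Hedged sketch of the patience counter the TODO above asks for: stop only
# after `patience` consecutive epochs without test-error improvement. The
# names (`evaluate_epoch`, `patience`, `bad_epochs`) are illustrative;
# `evaluate_epoch` stands in for one epoch of training plus evaluation.
def _example_patience_loop(evaluate_epoch, num_epochs, patience=5):
    import numpy as np
    best, bad_epochs = np.inf, 0
    for epoch in range(num_epochs):
        test_error = evaluate_epoch(epoch)  # runs one epoch, returns test error
        if test_error < best:
            best, bad_epochs = test_error, 0
        else:
            bad_epochs += 1
            if bad_epochs >= patience:
                break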
def main(env, ctrl_type, ctrl_args, overrides, logdir, args):
    ctrl_args = DotMap(**{key: val for (key, val) in ctrl_args})

    cfg = create_config(env, ctrl_type, ctrl_args, overrides, logdir)
    logger.info('\n' + pprint.pformat(cfg))

    if ctrl_type == "MPC":
        cfg.exp_cfg.exp_cfg.policy = MPC(cfg.ctrl_cfg)
    cfg.exp_cfg.misc = copy.copy(cfg)
    exp = MBExperiment(cfg.exp_cfg, train_policy=bool(args.train_policy))

    if not os.path.exists(exp.logdir):
        os.makedirs(exp.logdir)
    with open(os.path.join(exp.logdir, "config.txt"), "w") as f:
        f.write(pprint.pformat(cfg.toDict()))

    exp.run_experiment()
def build_loss(self):
    self._build_ph()
    self._tensor, self._update_operator = {}, {}

    self._MLP_var_list = self._MLP.get_variable_list()
    self._set_weight = tf_utils.set_network_weights(
        self._session, self._MLP_var_list, ''
    )
    logger.info("policy training learning rate: {}".format(self.args.policy_lr))

    self._session.run(tf.variables_initializer(tf.global_variables()))

    # Synchronize the two networks if needed.
    if self.args.cem_type in ['POPLINP-SEP', 'POPLINP-UNI'] and \
            self.args.training_scheme in ['BC-PR', 'BC-PI']:
        weight_dict = self._get_weight()  # read weights from the MLP
        self._set_weight(weight_dict)     # write them into the target MLP
def __init__(self, params): """Initializes a class instance. Arguments: params (DotMap): A dotmap of model parameters. .name (str): Model name, used for logging/use in variable scopes. Warning: Models with the same name will overwrite each other. .num_networks (int): (optional) The number of networks in the ensemble. Defaults to 1. Ignored if model is being loaded. .model_dir (str/None): (optional) Path to directory from which model will be loaded, and saved by default. Defaults to None. .load_model (bool): (optional) If True, model will be loaded from the model directory, assuming that the files are generated by a model of the same name. Defaults to False. .sess (tf.Session/None): The session that this model will use. If None, creates a session with its own associated graph. Defaults to None. """ self.name = get_required_argument(params, 'name', 'Must provide name.') self.model_dir = params.get('model_dir', None) if params.get('sess', None) is None: config = tf.ConfigProto() # config.gpu_options.allow_growth = True self._sess = tf.Session(config=config) else: self._sess = params.get('sess') # Instance variables self.finalized = False self.layers, self.decays, self.optvars, self.nonoptvars = [], [], [], [] self.scaler = None # Training objects self.optimizer = None self.sy_train_in, self.sy_train_targ = None, None self.train_op, self.mse_loss = None, None # Prediction objects self.sy_pred_in2d, self.sy_pred_mean2d_fac = None, None self.sy_pred_mean2d, self.sy_pred_var2d = None, None self.sy_pred_in3d, self.sy_pred_mean3d_fac = None, None if params.get('load_model', False): if self.model_dir is None: raise ValueError( "Cannot load model without providing model directory.") self._load_structure() self.num_nets, self.model_loaded = self.layers[ 0].get_ensemble_size(), True logger.info("Model loaded from %s." % self.model_dir) else: self.num_nets = params.get('num_networks', 1) self.model_loaded = False if self.num_nets == 1: logger.info( "Created a neural network without variance predictions.") else: logger.info( "Created an ensemble of %d neural networks without variance predictions." % (self.num_nets))
def obtain_solution(self, init_mean, init_var, per, dU, obs=None): """Optimizes the cost function provided in setup(). do gradient based planning Arguments: init_mean (np.ndarray): The mean of the initial candidate distribution. init_var (np.ndarray): The variance of the initial candidate distribution. """ assert self.tf_compatible self._print_count = (self._print_count + 1) % 20 self._print = self._print_count == 0 # step 1: initialize the action candidates TODO: use init_mean self._old_solutions = np.concatenate([ self.tf_sess.run(self._candidate_solutions)[:, 6:], np.random.uniform(self.lb[0], self.ub[0], [self.popsize, 6]) ], axis=1) self._candidate_solutions.load(self._old_solutions, self.tf_sess) avg_cost, min_cost = self.tf_sess.run( [self._average_cost, self._min_cost]) if self._print: logger.info('Init -> Avg_cost: %.3f, Min_cost: %.3f' % (avg_cost, min_cost)) # step 2: do gradient based planning for gbp_iteration in range(self._params.gbp_cfg.plan_iter): _, avg_cost, min_cost = self.tf_sess.run( [self._planning_optimizer, self._average_cost, self._min_cost]) avg_cost, min_cost = self.tf_sess.run( [self._average_cost, self._min_cost]) if self._print: logger.info('Iter %d > Avg_cost: %.3f, Min_cost: %.3f' % (self._params.gbp_cfg.plan_iter, avg_cost, min_cost)) sol = self.tf_sess.run(self.solution) prev_sol = self.update_prev_sol(per, dU, sol) return sol, prev_sol
def build_loss(self):
    # Build the state whitening variables and the start_state placeholders.
    self._build_ph()
    self._tensor, self._update_operator = {}, {}

    self._MLP_var_list = self._MLP.get_variable_list()
    # self._set_network_weight (defined via self._set_var_list) includes the
    # state whitening variables; _set_weight here covers the MLP weights only.
    self._set_weight = tf_utils.set_network_weights(
        self._session, self._MLP_var_list, '')

    # The learning rate is logged for reference only; no optimizer is built here.
    logger.info("policy training learning rate: {}".format(self.args.policy_lr))

    self._session.run(tf.variables_initializer(tf.global_variables()))

    # Synchronize the two networks if needed.
    if self.args.cem_type in ['POPLINP-SEP', 'POPLINP-UNI'] and \
            self.args.training_scheme in ['BC-PR', 'BC-PI']:
        weight_dict = self._get_weight()  # read weights from the MLP
        self._set_weight(weight_dict)     # write them into the target MLP
def __init__(self, params): """Creates class instance. Arguments: params .env (gym.env): Environment for which this controller will be used. .update_fns (list<func>): A list of functions that will be invoked (possibly with a tensorflow session) every time this controller is reset. .ac_ub (np.ndarray): (optional) An array of action upper bounds. Defaults to environment action upper bounds. .ac_lb (np.ndarray): (optional) An array of action lower bounds. Defaults to environment action lower bounds. .per (int): (optional) Determines how often the action sequence will be optimized. Defaults to 1 (reoptimizes at every call to act()). .prop_cfg .model_init_cfg (DotMap): A DotMap of initialization parameters for the model. .model_constructor (func): A function which constructs an instance of this model, given model_init_cfg. .model_train_cfg (dict): (optional) A DotMap of training parameters that will be passed into the model every time is is trained. Defaults to an empty dict. .model_pretrained (bool): (optional) If True, assumes that the model has been trained upon construction. .mode (str): Propagation method. Choose between [E, DS, TSinf, TS1, MM]. See https://arxiv.org/abs/1805.12114 for details. .npart (int): Number of particles used for DS, TSinf, TS1, and MM propagation methods. .ign_var (bool): (optional) Determines whether or not variance output of the model will be ignored. Defaults to False unless deterministic propagation is being used. .obs_preproc (func): (optional) A function which modifies observations (in a 2D matrix) before they are passed into the model. Defaults to lambda obs: obs. Note: Must be able to process both NumPy and Tensorflow arrays. .obs_postproc (func): (optional) A function which returns vectors calculated from the previous observations and model predictions, which will then be passed into the provided cost function on observations. Defaults to lambda obs, model_out: model_out. Note: Must be able to process both NumPy and Tensorflow arrays. .obs_postproc2 (func): (optional) A function which takes the vectors returned by obs_postproc and (possibly) modifies it into the predicted observations for the next time step. Defaults to lambda obs: obs. Note: Must be able to process both NumPy and Tensorflow arrays. .targ_proc (func): (optional) A function which takes current observations and next observations and returns the array of targets (so that the model learns the mapping obs -> targ_proc(obs, next_obs)). Defaults to lambda obs, next_obs: next_obs. Note: Only needs to process NumPy arrays. .opt_cfg .mode (str): Internal optimizer that will be used. Choose between [CEM, Random]. .cfg (DotMap): A map of optimizer initializer parameters. .plan_hor (int): The planning horizon that will be used in optimization. .obs_cost_fn (func): A function which computes the cost of every observation in a 2D matrix. Note: Must be able to process both NumPy and Tensorflow arrays. .ac_cost_fn (func): A function which computes the cost of every action in a 2D matrix. .log_cfg .save_all_models (bool): (optional) If True, saves models at every iteration. Defaults to False (only most recent model is saved). Warning: Can be very memory-intensive. .log_traj_preds (bool): (optional) If True, saves the mean and variance of predicted particle trajectories. Defaults to False. .log_particles (bool) (optional) If True, saves all predicted particles trajectories. Defaults to False. Note: Takes precedence over log_traj_preds. 
Warning: Can be very memory-intensive """ super().__init__(params) self._params = params self.dO, self.dU = params.env.observation_space.shape[0], params.env.action_space.shape[0] self.ac_ub, self.ac_lb = params.env.action_space.high, params.env.action_space.low assert np.max(self.ac_lb) == np.min(self.ac_lb) # just to make sure self.ac_ub = np.minimum(self.ac_ub, params.get("ac_ub", self.ac_ub)) self.ac_lb = np.maximum(self.ac_lb, params.get("ac_lb", self.ac_lb)) self.update_fns = params.get("update_fns", []) self.per = params.get("per", 1) self.model = get_required_argument( params.prop_cfg.model_init_cfg, "model_constructor", "Must provide a model constructor." )(params.prop_cfg.model_init_cfg, misc=params) self.model_train_cfg = params.prop_cfg.get("model_train_cfg", {}) self.prop_mode = get_required_argument(params.prop_cfg, "mode", "Must provide propagation method.") self.ign_var = params.prop_cfg.get("ign_var", False) or self.prop_mode == "E" self.obs_preproc = params.prop_cfg.get("obs_preproc", lambda obs: obs) self.obs_postproc = params.prop_cfg.get("obs_postproc", lambda obs, model_out: model_out) self.obs_postproc2 = params.prop_cfg.get("obs_postproc2", lambda next_obs: next_obs) self.targ_proc = params.prop_cfg.get("targ_proc", lambda obs, next_obs: next_obs) self.npart = get_required_argument(params.prop_cfg, "npart", "Must provide number of particles.") self.opt_mode = get_required_argument(params.opt_cfg, "mode", "Must provide optimization method.") self.plan_hor = get_required_argument(params.opt_cfg, "plan_hor", "Must provide planning horizon.") self.obs_cost_fn = get_required_argument(params.opt_cfg, "obs_cost_fn", "Must provide cost on observations.") self.ac_cost_fn = get_required_argument(params.opt_cfg, "ac_cost_fn", "Must provide cost on actions.") self.obs_ac_cost_fn = params.prop_cfg.get("obs_ac_cost_fn", None) self.save_all_models = params.log_cfg.get("save_all_models", False) self.log_traj_preds = params.log_cfg.get("log_traj_preds", False) self.log_particles = params.log_cfg.get("log_particles", False) # Perform argument checks if self.prop_mode not in ["E", "DS", "MM", "TS1", "TSinf", "GT"]: raise ValueError("Invalid propagation method.") if self.prop_mode in ["TS1", "TSinf"] and self.npart % self.model.num_nets != 0: raise ValueError("Number of particles must be a multiple of the ensemble size.") if self.prop_mode == "E" and self.npart != 1: raise ValueError("Deterministic propagation methods only need one particle.") # Create action sequence optimizer opt_cfg = params.opt_cfg.get("cfg", {}) self.optimizer = MPC.optimizers[params.opt_cfg.mode]( sol_dim=self.plan_hor * self.dU, lower_bound=np.tile(self.ac_lb, [self.plan_hor]), upper_bound=np.tile(self.ac_ub, [self.plan_hor]), tf_session=None if not self.model.is_tf_model else self.model.sess, params=params, **opt_cfg ) self._policy_network = self.optimizer.get_policy_network() # Controller state variables self.has_been_trained = params.prop_cfg.get("model_pretrained", False) self.ac_buf = np.array([]).reshape(0, self.dU) self.prev_sol = np.tile((self.ac_lb + self.ac_ub) / 2, [self.plan_hor]) self.init_var = np.tile(params.opt_cfg.init_var, [self.ac_lb.shape[0] * self.plan_hor]) self.train_in = np.array([]).reshape( 0, self.dU + self.obs_preproc(np.zeros([1, self.dO])).shape[-1] ) self.train_targs = np.array([]).reshape( 0, self.targ_proc(np.zeros([1, self.dO]), np.zeros([1, self.dO])).shape[-1] ) if self.model.is_tf_model: assert not self._params.il_cfg.use_gt_dynamics self.sy_cur_obs = 
tf.Variable(np.zeros(self.dO), dtype=tf.float32) self.ac_seq = tf.placeholder(shape=[1, self.plan_hor * self.dU], dtype=tf.float32) self.pred_cost, self.pred_traj = self._compile_cost(self.ac_seq, get_pred_trajs=True) self.optimizer.set_sy_cur_obs(self.sy_cur_obs) # IT IS A HACK. only run when using POPLINA-INIT self.optimizer.forward_policy_propose(self._predict_next_obs, self.sy_cur_obs) self.optimizer.setup(self._compile_cost, True) self.model.sess.run(tf.variables_initializer([self.sy_cur_obs])) self.prev_sol = self.optimizer.reset_prev_sol(self.prev_sol) # hack else: assert self._params.il_cfg.use_gt_dynamics logger.info("Created an MPC controller, prop mode %s, %d particles. " % (self.prop_mode, self.npart) + ("Ignoring variance." if self.ign_var else "")) if self.save_all_models: logger.info("Controller will save all models. (Note: This may be memory-intensive.") if self.log_particles: logger.info("Controller is logging particle predictions (Note: This may be memory-intensive).") self.pred_particles = [] elif self.log_traj_preds: logger.info("Controller is logging trajectory prediction statistics (mean+var).") self.pred_means, self.pred_vars = [], [] else: logger.info("Trajectory prediction logging is disabled.")
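# Hedged sketch (illustrative names and values, not a complete config): the
# overall shape of the params DotMap this controller expects when using the
# CEM optimizer with TSinf propagation. The costs here are toy stand-ins.
def _example_mpc_params(env, model_constructor):
    from dotmap import DotMap
    params = DotMap()
    params.env = env
    params.prop_cfg.model_init_cfg.model_constructor = model_constructor
    params.prop_cfg.mode = "TSinf"
    params.prop_cfg.npart = 20          # must be a multiple of the ensemble size
    params.opt_cfg.mode = "CEM"
    params.opt_cfg.plan_hor = 30
    params.opt_cfg.init_var = 0.25      # hypothetical initial CEM variance
    params.opt_cfg.obs_cost_fn = lambda obs: -obs[:, 0]            # toy cost
    params.opt_cfg.ac_cost_fn = lambda acs: 0.01 * (acs ** 2).sum(axis=1)
    return params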
import os.path as osp
import sys

# Make the repository root importable when this file is run directly.
sys.path.append(osp.dirname(osp.dirname(osp.realpath(__file__))))

from dmbrl.misc import logger

logger.info("appear twice")
def finalize(self, optimizer, optimizer_args=None, *args, **kwargs):
    """Finalizes the network.

    Arguments:
        optimizer: (tf.train.Optimizer) An optimizer class from those available
            at tf.train.Optimizer.
        optimizer_args: (dict) A dictionary of arguments for the __init__ method
            of the chosen optimizer.

    Returns: None
    """
    if len(self.layers) == 0:
        raise RuntimeError("Cannot finalize an empty network.")
    if self.finalized:
        raise RuntimeError("Can only finalize a network once.")

    optimizer_args = {} if optimizer_args is None else optimizer_args
    self.optimizer = optimizer(**optimizer_args)

    # Construct all variables.
    with self.sess.as_default():
        with tf.variable_scope(self.name):
            self.scaler = TensorStandardScaler(self.layers[0].get_input_dim())
            for i, layer in enumerate(self.layers):
                with tf.variable_scope("Layer%i" % i):
                    layer.construct_vars()
                    self.decays.extend(layer.get_decays())
                    self.optvars.extend(layer.get_vars())
    self.nonoptvars.extend(self.scaler.get_vars())

    # Set up training.
    with tf.variable_scope(self.name):
        # Recreate the optimizer within the model's variable scope.
        self.optimizer = optimizer(**optimizer_args)
        self.sy_train_in = tf.placeholder(
            dtype=tf.float32,
            shape=[self.num_nets, None, self.layers[0].get_input_dim()],
            name="training_inputs")
        self.sy_train_targ = tf.placeholder(
            dtype=tf.float32,
            shape=[self.num_nets, None, self.layers[-1].get_output_dim()],
            name="training_targets")
        train_loss = tf.reduce_sum(
            self._compile_losses(self.sy_train_in, self.sy_train_targ))
        train_loss += tf.add_n(self.decays)
        self.mse_loss = self._compile_losses(self.sy_train_in, self.sy_train_targ)
        self.train_op = self.optimizer.minimize(train_loss, var_list=self.optvars)

    # Initialize all variables.
    self.sess.run(tf.variables_initializer(
        self.optvars + self.nonoptvars + self.optimizer.variables()))

    # Set up prediction.
    with tf.variable_scope(self.name):
        self.sy_pred_in2d = tf.placeholder(
            dtype=tf.float32,
            shape=[None, self.layers[0].get_input_dim()],
            name="2D_training_inputs")
        self.sy_pred_mean2d_fac = self.create_prediction_tensors(
            self.sy_pred_in2d, factored=True)[0]
        self.sy_pred_mean2d = tf.reduce_mean(self.sy_pred_mean2d_fac, axis=0)
        self.sy_pred_var2d = tf.reduce_mean(
            tf.square(self.sy_pred_mean2d_fac - self.sy_pred_mean2d), axis=0)

        self.sy_pred_in3d = tf.placeholder(
            dtype=tf.float32,
            shape=[self.num_nets, None, self.layers[0].get_input_dim()],
            name="3D_training_inputs")
        self.sy_pred_mean3d_fac = \
            self.create_prediction_tensors(self.sy_pred_in3d, factored=True)[0]

    # Load model weights if needed.
    if self.model_loaded or self.load_model_values:
        with self.sess.as_default():
            load_path = self.model_dir  # ends with .npz
            logger.info("Restoring dynamics network weights from {}".format(load_path))
            data = np.load(load_path)
            for var_name in data.files:
                logger.info("Loading value to variable {}".format(var_name))
                tensor = self.sess.graph.get_tensor_by_name("{}:0".format(var_name))
                self.sess.run(tf.assign(tensor, data[var_name]))

    self.finalized = True
def build_loss(self): """ @brief: the MLP is used to generate samples, while the target_MLP is used during the training. target_MLP is always older than the MLP, and we feed the dataset into target_MLP to train MLP. After each update, we synchronize target_MLP by copying weights from MLP. """ # state whitening_operator is build in when calling _build_ph() self._build_ph() self._tensor, self._update_operator = {}, {} # here build target_state whitening_operator for normalize whitening_util.add_whitening_operator(self._whitening_operator, self._whitening_variable, 'target_state', self._observation_size) # the weight input_ph is always set to 0.0 self._input_ph['weight'] = tf.placeholder( shape=[None, self._MLP.get_weight_size()], dtype=tf.float32, name='weight_noise') # the actual weight generated from the planning self._input_ph['target_weight'] = tf.placeholder( shape=[None, self._MLP.get_weight_size()], dtype=tf.float32, name='target_weight_noise') self._tensor['net_input'] = (self._input_ph['start_state'] - self._whitening_operator['state_mean'] ) / self._whitening_operator['state_std'] self._tensor['target_net_input'] = ( self._input_ph['start_state'] - self._whitening_operator['target_state_mean'] ) / self._whitening_operator['target_state_std'] # the output policy of the network self._tensor['action'] = self._MLP(self._tensor['net_input'], self._input_ph['weight']) self._tensor['target_action'] = self._target_MLP( self._tensor['target_net_input'], self._input_ph['target_weight']) # the distillation loss self._update_operator['loss'] = tf.reduce_mean( tf.square(self._tensor['target_action'] - self._tensor['action'])) self._target_MLP_var_list = self._target_MLP.get_variable_list() self._MLP_var_list = self._MLP.get_variable_list() self._update_operator['update_op'] = tf.train.AdamOptimizer( learning_rate=self.args.policy_lr, ).minimize( self._update_operator['loss'], var_list=self._MLP_var_list) logger.info("policy training learning rate: {}".format( self.args.policy_lr)) # synchronize the weights self._get_weight = tf_utils.get_network_weights( self._session, self._MLP_var_list, 'policy_mlp') self._set_weight = tf_utils.set_network_weights( self._session, self._target_MLP_var_list, 'target_policy_mlp') self._session.run(tf.variables_initializer(tf.global_variables())) # synchronize the two networks if needed self._set_weight(self._get_weight()) # set the target MLP
def obtain_solution(self, init_mean, init_var, per, dU, obs=None): """Optimizes the cost function using the provided initial candidate distribution Arguments: init_mean (np.ndarray): The mean of the initial candidate distribution. init_var (np.ndarray): The variance of the initial candidate distribution. """ self._print_count = (self._print_count + 1) % 20 self._print = self._print_count == 0 if self._gbp_type == 3: sol, solvar = self.tf_sess.run([self.mean, self.var], feed_dict={ self.init_mean: init_mean, self.init_var: init_var }) self._tf_dict['mean']['candidate_solutions'].load( sol.reshape([1, -1]), self.tf_sess) avg_cost = self.tf_sess.run( self._tf_dict['mean']['costs']).reshape([-1]) if self._print: logger.info('Init -> cost: %.3f' % (avg_cost)) # step 2: do gradient based planning for gbp_iteration in range(self._params.gbp_cfg.plan_iter): self.tf_sess.run(self._tf_dict['mean']['planning_optimizer']) avg_cost = self.tf_sess.run( self._tf_dict['mean']['costs']).reshape([-1]) if self._print: logger.info('AFTER %d iter -> cost: %.3f' % (self._params.gbp_cfg.plan_iter, avg_cost)) sol = self.tf_sess.run(self._tf_dict['mean']['solution']).reshape( [-1]) elif self._gbp_type == 2: ''' @1 / 2: do the gradient based-planning with in the loop @1: do the planning for all the candidates @2: do the planning only for the top k candidates ''' mean, var, t = init_mean, init_var, 0 X = stats.truncnorm(-2, 2, loc=np.zeros_like(mean), scale=np.ones_like(mean)) while (t < self.max_iters) and np.max(var) > self.epsilon: lb_dist, ub_dist = mean - self.lb, self.ub - mean constrained_var = np.minimum( np.minimum(np.square(lb_dist / 2), np.square(ub_dist / 2)), var) samples = X.rvs(size=[self.popsize, self.sol_dim]) * \ np.sqrt(constrained_var) + mean self._tf_dict['popsize']['candidate_solutions'].load( samples.reshape([self.popsize, -1]), self.tf_sess) costs = self.tf_sess.run(self._tf_dict['popsize']['costs']) sort_id = np.argsort(costs) elites = samples[sort_id][:self.num_elites] # step 2: do gradient based planning for the top k candidates self._tf_dict['top_k']['candidate_solutions'].load( elites.reshape([self.num_elites, -1]), self.tf_sess) if self._print: logger.info('Init elites score -> cost: %f' % np.mean(costs[sort_id][:self.num_elites])) for gbp_iteration in range(self._params.gbp_cfg.plan_iter): self.tf_sess.run( self._tf_dict['top_k']['planning_optimizer']) if self._print: logger.info('AFTER %d iter -> cost: %f.' 
% (self._params.gbp_cfg.plan_iter, np.mean( self.tf_sess.run( self._tf_dict['top_k']['costs'])))) elites = self.tf_sess.run( self._tf_dict['top_k']['candidate_solutions']) new_mean = np.mean(elites, axis=0) new_var = np.var(elites, axis=0) mean = self.alpha * mean + (1 - self.alpha) * new_mean var = self.alpha * var + (1 - self.alpha) * new_var t += 1 sol, solvar = mean, var elif self._gbp_type == 1: ''' @1 / 2: do the gradient based-planning with in the loop @1: do the planning for all the candidates @2: do the planning only for the top k candidates ''' mean, var, t = init_mean, init_var, 0 X = stats.truncnorm(-2, 2, loc=np.zeros_like(mean), scale=np.ones_like(mean)) while (t < self.max_iters) and np.max(var) > self.epsilon: lb_dist, ub_dist = mean - self.lb, self.ub - mean constrained_var = np.minimum( np.minimum(np.square(lb_dist / 2), np.square(ub_dist / 2)), var) samples = X.rvs(size=[self.popsize, self.sol_dim]) * \ np.sqrt(constrained_var) + mean self._tf_dict['popsize']['candidate_solutions'].load( samples.reshape([self.popsize, -1]), self.tf_sess) costs = self.tf_sess.run(self._tf_dict['popsize']['costs']) sort_id = np.argsort(costs) old_elites = samples[sort_id][:self.num_elites] old_costs = costs[sort_id][:self.num_elites] if self._print: logger.info('Init elites score -> cost: %f' % np.mean(old_costs)) # step 2: do gradient based planning for gbp_iteration in range(self._params.gbp_cfg.plan_iter): self.tf_sess.run( self._tf_dict['popsize']['planning_optimizer']) samples, costs = self.tf_sess.run([ self._tf_dict['popsize']['candidate_solutions'], self._tf_dict['popsize']['costs'] ]) elites = np.concatenate([samples, old_elites], axis=0) costs = np.concatenate([costs, old_costs]) sort_id = np.argsort(costs) elites = elites[sort_id][:self.num_elites] costs = costs[sort_id][:self.num_elites] if self._print: logger.info( 'AFTER %d iter -> cost: %f.' % (self._params.gbp_cfg.plan_iter, np.mean(costs))) new_mean = np.mean(elites, axis=0) new_var = np.var(elites, axis=0) mean = self.alpha * mean + (1 - self.alpha) * new_mean var = self.alpha * var + (1 - self.alpha) * new_var t += 1 sol, solvar = mean, var else: assert False prev_sol = self.update_prev_sol(per, dU, sol) return sol, prev_sol
def run_experiment(self): """Perform experiment. """ os.makedirs(self.logdir, exist_ok=True) traj_obs, traj_acs, traj_rets, traj_rews = [], [], [], [] test_traj_obs, test_traj_acs, test_traj_rets = [], [], [] episode_iter_id = [] # Perform initial rollouts samples = [] needed_num_steps = self.ninit_rollouts * self.task_hor finished_num_steps = 0 """ # TODO DEBUG needed_num_steps = 64 self.task_hor = 64 """ while True: samples.append( self.agent.sample(self.task_hor, self.policy, self.delay_hor)) traj_obs.append(samples[-1]["obs"]) traj_acs.append(samples[-1]["ac"]) traj_rews.append(samples[-1]["rewards"]) finished_num_steps += len(samples[-1]["ac"]) print(finished_num_steps) if finished_num_steps >= needed_num_steps: break if self.ninit_rollouts > 0: self.policy.train([sample["obs"] for sample in samples], [sample["ac"] for sample in samples], [sample["rewards"] for sample in samples]) # Training loop for i in range(self.ntrain_iters): logger.info( "####################################################################" ) logger.info("Starting training iteration %d." % (i + 1)) iter_dir = os.path.join(self.logdir, "train_iter%d" % (i + 1)) os.makedirs(iter_dir, exist_ok=True) samples = [] assert self.nrecord == 0 needed_num_steps = self.task_hor * \ (max(self.neval, self.nrollouts_per_iter) - self.nrecord) finished_num_steps = 0 while True: samples.append( self.agent.sample(self.task_hor, self.policy, self.delay_hor)) finished_num_steps += len(samples[-1]["ac"]) if finished_num_steps >= needed_num_steps: break logger.info("Rewards obtained: {}".format( [sample["reward_sum"] for sample in samples[:self.neval]])) # test the policy if needed if self._params.misc.ctrl_cfg.cem_cfg.test_policy > 0: test_data = [] for _ in range(5): test_data.append( self.agent.sample(self.task_hor, self.policy, test_policy=True, average=False)) test_traj_rets.extend([ np.mean([ i_test_data["reward_sum"] for i_test_data in test_data ]) ]) test_traj_obs.extend( [i_test_data["obs"] for i_test_data in test_data]) test_traj_acs.extend( [i_test_data["ac"] for i_test_data in test_data]) traj_obs.extend([sample["obs"] for sample in samples]) traj_acs.extend([sample["ac"] for sample in samples]) traj_rets.extend([sample["reward_sum"] for sample in samples]) traj_rews.extend([sample["rewards"] for sample in samples]) episode_iter_id.extend([i] * len(samples)) samples = samples[:self.nrollouts_per_iter] self.policy.dump_logs(self.logdir, iter_dir) savemat( os.path.join(self.logdir, "logs.mat"), { "observations": traj_obs, "actions": traj_acs, "returns": traj_rets, "rewards": traj_rews, "test_returns": test_traj_rets, "test_obs": test_traj_obs, "test_acs": test_traj_acs, 'episode_iter_id': episode_iter_id }) # Delete iteration directory if not used if len(os.listdir(iter_dir)) == 0: os.rmdir(iter_dir) if i < self.ntrain_iters - 1: self.policy.train([sample["obs"] for sample in samples], [sample["ac"] for sample in samples], [sample["rewards"] for sample in samples])
def optimize_weights(self, data_dict, training_keys):
    self._set_whitening_var(data_dict['whitening_stats'])

    for i_epoch in range(self.args.discriminator_epochs):
        # Step 1: generate the GAN noise and shuffle the training ids.
        data_dict['weight'] = \
            generate_noise(data_dict['target_weight'], self.args.init_var)
        data_id = np.arange(len(data_dict['start_state']))
        self._npr.shuffle(data_id)
        num_minibatch = max(
            len(data_id) // self.args.discriminator_minibatch_size, 1)
        recorder = {'disc_loss': [], 'entropy': [], 'policy_loss': [],
                    'weight_decay': [], 'd_true_acc': [], 'd_fake_acc': []}

        for start in range(num_minibatch):
            start_id = start * self.args.discriminator_minibatch_size
            end_id = min(start_id + self.args.discriminator_minibatch_size,
                         len(data_dict['start_state']))
            batch_inds = data_id[start_id:end_id]
            feed_dict = {self._input_ph[key]: data_dict[key][batch_inds]
                         for key in training_keys}

            # Step 2: optimize the discriminator.
            disc_log = self._session.run(
                {'disc_loss': self._update_operator['discriminator_loss'],
                 'entropy': self._update_operator['entropy_loss'],
                 'd_true_acc': self._update_operator['true_data_accuracy'],
                 'op': self._update_operator['disc_update_op']},
                feed_dict=feed_dict)

            # Step 3: optimize the generator (train the policy network).
            policy_log = self._session.run(
                {'policy_loss': self._update_operator['policy_loss'],
                 'weight_decay': self._update_operator['weight_decay_loss'],
                 'd_fake_acc': self._update_operator['fake_data_accuracy'],
                 'op': self._update_operator['policy_update_op']},
                feed_dict=feed_dict)
            policy_log.update(disc_log)
            for key in recorder:
                recorder[key].append(policy_log[key])

        logger.info("GAN policy epoch: {}".format(i_epoch))
        for key in recorder:
            logger.info("\t[loss] " + key + ": " + "%.6f" % np.mean(recorder[key]))

    # Step 4: synchronize the target network.
    self._set_weight(self._get_weight())
    whitening_util.copy_whitening_var(data_dict['whitening_stats'],
                                      'state', 'target_state')
    whitening_util.set_whitening_var(self._session,
                                     self._whitening_operator,
                                     data_dict['whitening_stats'],
                                     ['target_state'])
def sample(self, horizon, policy, record_fname=None,
           test_policy=False, average=False):
    """Samples a rollout from the agent.

    Arguments:
        horizon: (int) The length of the rollout to generate from the agent.
        policy: (policy) The policy that the agent will use for actions.
        record_fname: (str/None) The name of the file to which a recording of the
            rollout will be saved. If None, the rollout will not be recorded.

    Returns: (dict) A dictionary containing data from the rollout.
        The keys of the dictionary are 'obs', 'ac', 'reward_sum', and 'rewards'.
    """
    if test_policy:
        logger.info('Testing the policy')
    video_record = record_fname is not None
    recorder = None if not video_record else VideoRecorder(self.env, record_fname)

    times, rewards = [], []
    O, A, reward_sum, done = [self.env.reset()], [], 0, False

    self._debug += 1
    policy.reset()
    for t in range(horizon):
        if hasattr(self.env, 'render_imitation'):
            self.env.render_imitation()
        if t % 50 == 10 and t > 1:
            logger.info('Current timesteps: %d / %d, average time: %.5f' %
                        (t, horizon, np.mean(times)))
        if video_record:
            recorder.capture_frame()
        start = time.time()
        if test_policy:
            A.append(policy.act(O[t], t, test_policy=test_policy, average=average))
        else:
            A.append(policy.act(O[t], t))
        times.append(time.time() - start)

        if self.noise_stddev is None:
            obs, reward, done, info = self.env.step(A[t])
        else:
            # Perturb the action with Gaussian noise, clipped to the action bounds.
            action = A[t] + np.random.normal(loc=0, scale=self.noise_stddev,
                                             size=[self.dU])
            action = np.minimum(np.maximum(action, self.env.action_space.low),
                                self.env.action_space.high)
            obs, reward, done, info = self.env.step(action)
        O.append(obs)
        reward_sum += reward
        rewards.append(reward)
        if done:
            break

    if video_record:
        recorder.capture_frame()
        recorder.close()

    logger.info("Average action selection time: %.4f" % np.mean(times))
    logger.info("Rollout length: %d" % len(A))

    return {
        "obs": np.array(O),
        "ac": np.array(A),
        "reward_sum": reward_sum,
        "rewards": np.array(rewards),
    }
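# Hedged usage sketch: collecting one recorded evaluation rollout with the
# method above. The mp4 filename is illustrative.
def _example_rollout(agent, policy, task_hor):
    sample = agent.sample(task_hor, policy,
                          record_fname="rollout.mp4", test_policy=True)
    return sample["reward_sum"], sample["obs"].shape[0]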
def run_experiment(self): """Perform experiment. """ os.makedirs(self.logdir, exist_ok=True) traj_obs, traj_acs, traj_rets, traj_rews = [], [], [], [] test_traj_obs, test_traj_acs, test_traj_rets = [], [], [] episode_iter_id = [] # Perform initial rollouts samples = [] needed_num_steps = self.ninit_rollouts * self.task_hor finished_num_steps = 0 """ # TODO DEBUG needed_num_steps = 64 self.task_hor = 64 """ # logger.info("Collecting n_init rollout before policy trainning") while True: samples.append(self.agent.sample(self.task_hor, self.policy)) traj_obs.append(samples[-1]["obs"]) traj_acs.append(samples[-1]["ac"]) traj_rews.append(samples[-1]["rewards"]) finished_num_steps += len(samples[-1]["ac"]) if finished_num_steps >= needed_num_steps: break if self.ninit_rollouts > 0: # logger.info("Performing init policy trianing") self.policy.train([sample["obs"] for sample in samples], [sample["ac"] for sample in samples], [sample["rewards"] for sample in samples]) # Training loop for i in range(self.ntrain_iters): logger.info( "####################################################################" ) logger.info("Starting training iteration %d." % (i + 1)) iter_dir = os.path.join(self.logdir, "train_iter%d" % (i + 1)) os.makedirs(iter_dir, exist_ok=True) samples = [] assert self.nrecord == 0 needed_num_steps = self.task_hor * \ (max(self.neval, self.nrollouts_per_iter) - self.nrecord) finished_num_steps = 0 while True: samples.append(self.agent.sample(self.task_hor, self.policy)) finished_num_steps += len(samples[-1]["ac"]) if finished_num_steps >= needed_num_steps: break logger.info("Rewards obtained: {}".format( [sample["reward_sum"] for sample in samples[:self.neval]])) # test the policy if needed # comment out by ShenShuo # passing while config to misc is much too messy # we juse comment it out, if need testing policy, we consider a smarter way to pass # test_policy arg # if self._params.misc.ctrl_cfg.cem_cfg.test_policy > 0: # test_data = [] # for _ in range(5): # test_data.append( # self.agent.sample(self.task_hor, self.policy, # test_policy=True, average=False) # ) # test_traj_rets.extend([ # np.mean([i_test_data["reward_sum"] for i_test_data in test_data]) # ]) # test_traj_obs.extend( # [i_test_data["obs"] for i_test_data in test_data] # ) # test_traj_acs.extend( # [i_test_data["ac"] for i_test_data in test_data] # ) traj_obs.extend([sample["obs"] for sample in samples]) traj_acs.extend([sample["ac"] for sample in samples]) traj_rets.extend([sample["reward_sum"] for sample in samples]) traj_rews.extend([sample["rewards"] for sample in samples]) episode_iter_id.extend([i] * len(samples)) samples = samples[:self.nrollouts_per_iter] self.policy.dump_logs(self.logdir, iter_dir) savemat( os.path.join(self.logdir, "logs.mat"), { "observations": traj_obs, "actions": traj_acs, "returns": traj_rets, "rewards": traj_rews, "test_returns": test_traj_rets, "test_obs": test_traj_obs, "test_acs": test_traj_acs, 'episode_iter_id': episode_iter_id }) # Delete iteration directory if not used if len(os.listdir(iter_dir)) == 0: os.rmdir(iter_dir) # train policy and model together if i < self.ntrain_iters - 1: self.policy.train([sample["obs"] for sample in samples], [sample["ac"] for sample in samples], [sample["rewards"] for sample in samples]) if i % 10 == 0: self.log_model_predictions(i)