def __init__(self, params): """Initializes class instance. Argument: params (DotMap): A DotMap containing the following: .sim_cfg: .env (gym.env): Environment for this experiment .task_hor (int): Task horizon .stochastic (bool): (optional) If True, agent adds noise to its actions. Must provide noise_std (see below). Defaults to False. .noise_std (float): for stochastic agents, noise of the form N(0, noise_std^2I) will be added. .exp_cfg: .ntrain_iters (int): Number of training iterations to be performed. .nrollouts_per_iter (int): (optional) Number of rollouts done between training iterations. Defaults to 1. .ninit_rollouts (int): (optional) Number of initial rollouts. Defaults to 1. .policy (controller): Policy that will be trained. .log_cfg: .logdir (str): Parent of directory path where experiment data will be saved. Experiment will be saved in logdir/<date+time of experiment start> .nrecord (int): (optional) Number of rollouts to record for every iteration. Defaults to 0. .neval (int): (optional) Number of rollouts for performance evaluation. Defaults to 1. """ self.env = get_required_argument(params.sim_cfg, "env", "Must provide environment.") self.task_hor = get_required_argument(params.sim_cfg, "task_hor", "Must provide task horizon.") self._params = params params.sim_cfg.misc = copy.copy(params) if params.sim_cfg.get("stochastic", False): self.agent = Agent(DotMap( env=self.env, noisy_actions=True, noise_stddev=get_required_argument( params.sim_cfg, "noise_std", "Must provide noise standard deviation in the case of a stochastic environment." ), params=params )) else: self.agent = Agent(DotMap(env=self.env, noisy_actions=False, params=params)) self.ntrain_iters = get_required_argument( params.exp_cfg, "ntrain_iters", "Must provide number of training iterations." ) self.nrollouts_per_iter = params.exp_cfg.get("nrollouts_per_iter", 1) self.ninit_rollouts = params.exp_cfg.get("ninit_rollouts", 1) self.policy = get_required_argument(params.exp_cfg, "policy", "Must provide a policy.") self.logdir = os.path.join( get_required_argument(params.log_cfg, "logdir", "Must provide log parent directory."), strftime("%Y-%m-%d--%H:%M:%S", localtime()) ) logger.set_file_handler(path=self.logdir) logger.info('Starting the experiments') self.nrecord = params.log_cfg.get("nrecord", 0) self.neval = params.log_cfg.get("neval", 1)
def train(self, inputs, targets, *args, **kwargs):
    """Optimizes the parameters of the internal GP model.

    Arguments:
        inputs: (np.ndarray) An array of inputs.
        targets: (np.ndarray) An array of targets.

    Returns: None.
    """
    # Shuffle the data, then take the first num_inducing_points rows as the
    # initial inducing inputs (zero-padded if there is too little data).
    perm = np.random.permutation(inputs.shape[0])
    inputs, targets = inputs[perm], targets[perm]

    Z = np.copy(inputs[:self.num_inducing_points])
    if Z.shape[0] < self.num_inducing_points:
        Z = np.concatenate([
            Z, np.zeros([self.num_inducing_points - Z.shape[0], Z.shape[1]])
        ])
    self.model.X = inputs
    self.model.Y = targets
    self.model.feature.Z = Z

    with self.sess.as_default():
        self.model.compile()
        logger.info("Optimizing model...")
        gpflow.train.ScipyOptimizer().minimize(self.model)
        logger.info("Done.")
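# Hedged sketch: fitting the sparse-GP wrapper above on toy data. The shapes
# are illustrative; if fewer rows than num_inducing_points are supplied, the
# zero-padding branch in train() runs.
def _example_fit_gp(gp_wrapper):
    import numpy as np
    X = np.random.randn(256, 4)   # toy inputs
    Y = np.sin(X[:, :1])          # toy 1-D targets
    gp_wrapper.train(X, Y)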
def build_loss(self):
    # Build the start_state placeholder and the whitening variables.
    self._build_ph()
    self._tensor, self._update_operator = {}, {}

    # Construct the input to the forward network: normalize the state input.
    self._tensor['normalized_start_state'] = (
        self._input_ph['start_state'] -
        self._whitening_operator['state_mean']
    ) / self._whitening_operator['state_std']
    self._tensor['net_input'] = self._tensor['normalized_start_state']

    # The output action of the policy network.
    self._tensor['action'] = self._MLP(self._tensor['net_input'])

    self._input_ph['target_action'] = tf.placeholder(
        tf.float32, [None, self._action_size], name='target_action')

    # Mean-squared behavior-cloning loss between target and predicted actions.
    self._update_operator['loss'] = tf.reduce_mean(
        tf.square(self._input_ph['target_action'] - self._tensor['action']))

    self._update_operator['update_op'] = tf.train.AdamOptimizer(
        learning_rate=self.args.policy_lr,
    ).minimize(self._update_operator['loss'])
    logger.info("policy training learning rate: {}".format(self.args.policy_lr))
def obtain_solution(self, init_mean, init_var, per, dU, obs=None): """Optimizes the cost function using the provided initial candidate distribution Arguments: init_mean (np.ndarray): The mean of the initial candidate distribution. init_var (np.ndarray): The variance of the initial candidate distribution. """ if self.tf_compatible: sol, solvar = self.tf_sess.run([self.mean, self.var], feed_dict={ self.init_mean: init_mean, self.init_var: init_var }) else: assert self._params.il_cfg.use_gt_dynamics mean, var, t = init_mean, init_var, 0 X = stats.truncnorm(-2, 2, loc=np.zeros_like(mean), scale=np.ones_like(mean)) cfg = { 'plan_hor': self._params.opt_cfg.plan_hor, 'dU': self._params.env.action_space.shape[0] } while (t < self.max_iters) and np.max(var) > self.epsilon: lb_dist, ub_dist = mean - self.lb, self.ub - mean constrained_var = np.minimum( np.minimum(np.square(lb_dist / 2), np.square(ub_dist / 2)), var) samples = X.rvs(size=[self.popsize, self.sol_dim]) * np.sqrt( constrained_var) + mean costs = self._gt_compile_cost( obs, samples, cfg, self._dynamics, self._dynamics._numpy_reward_function) costs = np.reshape(costs, [-1]) elites = samples[np.argsort(costs)][:self.num_elites] new_mean = np.mean(elites, axis=0) new_var = np.var(elites, axis=0) mean = self.alpha * mean + (1 - self.alpha) * new_mean var = self.alpha * var + (1 - self.alpha) * new_var logger.info('variance of elite: {}'.format(np.var(elites))) logger.info('Mean perforamnce: {}'.format( np.mean(costs[np.argsort(costs)][:self.num_elites]))) t += 1 sol, solvar = mean, var sol = np.reshape(sol, [-1]) # prev_sol is going to be used next timestep prev_sol = self.update_prev_sol(per, dU, sol) return sol, prev_sol
def optimize_weights(self, data_dict, training_keys):
    # data_dict is nested as data_dict[key][set_id][num_data]; the set_id
    # dimension looks odd -- double-check where it is fed.
    test_set_id = np.arange(len(data_dict['start_state']))
    num_test_data = int(len(test_set_id) * self.args.pct_testset)
    self._npr.shuffle(test_set_id)

    # Split the shuffled data into a held-out test set and a training set.
    test_set = {key: data_dict[key][test_set_id][:num_test_data]
                for key in training_keys}
    train_set = {key: data_dict[key][test_set_id][num_test_data:]
                 for key in training_keys}

    test_error = old_test_error = np.inf

    # Supervised training of the behavior policy (behavior cloning).
    for epoch in range(self.args.policy_epochs):
        total_batch_len = len(train_set['start_state'])
        total_batch_inds = np.arange(total_batch_len)
        self._npr.shuffle(total_batch_inds)
        num_minibatch = max(total_batch_len // self.args.minibatch_size, 1)
        train_error = []

        for batch_id in range(num_minibatch):
            start = batch_id * self.args.minibatch_size
            end = min(start + self.args.minibatch_size, total_batch_len)
            batch_inds = total_batch_inds[start:end]

            # batch_inds index into the shuffled training split, so feed from
            # train_set; feeding from the full data_dict would mix in test rows.
            feed_dict = {self._input_ph[key]: train_set[key][batch_inds]
                         for key in training_keys}
            error, _ = self._session.run(
                [self._update_operator['loss'],
                 self._update_operator['update_op']],
                feed_dict=feed_dict
            )
            train_error.append(error)

        # Evaluate the held-out test error.
        feed_dict = {self._input_ph[key]: test_set[key]
                     for key in training_keys}
        test_error = self._session.run(self._update_operator['loss'],
                                       feed_dict=feed_dict)

        logger.info('Epoch %d; Train Error: %.6f; Test Error: %.6f' %
                    (epoch, np.mean(train_error), test_error))
        if test_error > old_test_error and epoch % 5 == 0:
            # TODO: use a patience counter instead of this heuristic.
            logger.info('Early stopping')
            break
        else:
            old_test_error = test_error
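# Hedged sketch of the patience counter the TODO above asks for: stop only
# after `patience` consecutive epochs without test-error improvement. The
# names (`evaluate_epoch`, `patience`, `bad_epochs`) are illustrative;
# `evaluate_epoch` stands in for one epoch of training plus evaluation.
def _example_patience_loop(evaluate_epoch, num_epochs, patience=5):
    import numpy as np
    best, bad_epochs = np.inf, 0
    for epoch in range(num_epochs):
        test_error = evaluate_epoch(epoch)  # runs one epoch, returns test error
        if test_error < best:
            best, bad_epochs = test_error, 0
        else:
            bad_epochs += 1
            if bad_epochs >= patience:
                break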
def main(env, ctrl_type, ctrl_args, overrides, logdir, args):
    ctrl_args = DotMap(**{key: val for (key, val) in ctrl_args})

    cfg = create_config(env, ctrl_type, ctrl_args, overrides, logdir)
    logger.info('\n' + pprint.pformat(cfg))

    if ctrl_type == "MPC":
        cfg.exp_cfg.exp_cfg.policy = MPC(cfg.ctrl_cfg)
    cfg.exp_cfg.misc = copy.copy(cfg)
    exp = MBExperiment(cfg.exp_cfg, train_policy=bool(args.train_policy))

    if not os.path.exists(exp.logdir):
        os.makedirs(exp.logdir)
    with open(os.path.join(exp.logdir, "config.txt"), "w") as f:
        f.write(pprint.pformat(cfg.toDict()))

    exp.run_experiment()
def build_loss(self):
    self._build_ph()
    self._tensor, self._update_operator = {}, {}

    self._MLP_var_list = self._MLP.get_variable_list()
    self._set_weight = tf_utils.set_network_weights(
        self._session, self._MLP_var_list, ''
    )
    logger.info("policy training learning rate: {}".format(self.args.policy_lr))

    self._session.run(tf.variables_initializer(tf.global_variables()))

    # Synchronize the two networks if needed.
    if self.args.cem_type in ['POPLINP-SEP', 'POPLINP-UNI'] and \
            self.args.training_scheme in ['BC-PR', 'BC-PI']:
        weight_dict = self._get_weight()  # read weights from the MLP
        self._set_weight(weight_dict)     # write them into the target MLP
def __init__(self, params): """Initializes a class instance. Arguments: params (DotMap): A dotmap of model parameters. .name (str): Model name, used for logging/use in variable scopes. Warning: Models with the same name will overwrite each other. .num_networks (int): (optional) The number of networks in the ensemble. Defaults to 1. Ignored if model is being loaded. .model_dir (str/None): (optional) Path to directory from which model will be loaded, and saved by default. Defaults to None. .load_model (bool): (optional) If True, model will be loaded from the model directory, assuming that the files are generated by a model of the same name. Defaults to False. .sess (tf.Session/None): The session that this model will use. If None, creates a session with its own associated graph. Defaults to None. """ self.name = get_required_argument(params, 'name', 'Must provide name.') self.model_dir = params.get('model_dir', None) if params.get('sess', None) is None: config = tf.ConfigProto() # config.gpu_options.allow_growth = True self._sess = tf.Session(config=config) else: self._sess = params.get('sess') # Instance variables self.finalized = False self.layers, self.decays, self.optvars, self.nonoptvars = [], [], [], [] self.scaler = None # Training objects self.optimizer = None self.sy_train_in, self.sy_train_targ = None, None self.train_op, self.mse_loss = None, None # Prediction objects self.sy_pred_in2d, self.sy_pred_mean2d_fac = None, None self.sy_pred_mean2d, self.sy_pred_var2d = None, None self.sy_pred_in3d, self.sy_pred_mean3d_fac = None, None if params.get('load_model', False): if self.model_dir is None: raise ValueError( "Cannot load model without providing model directory.") self._load_structure() self.num_nets, self.model_loaded = self.layers[ 0].get_ensemble_size(), True logger.info("Model loaded from %s." % self.model_dir) else: self.num_nets = params.get('num_networks', 1) self.model_loaded = False if self.num_nets == 1: logger.info( "Created a neural network without variance predictions.") else: logger.info( "Created an ensemble of %d neural networks without variance predictions." % (self.num_nets))
def obtain_solution(self, init_mean, init_var, per, dU, obs=None): """Optimizes the cost function provided in setup(). do gradient based planning Arguments: init_mean (np.ndarray): The mean of the initial candidate distribution. init_var (np.ndarray): The variance of the initial candidate distribution. """ assert self.tf_compatible self._print_count = (self._print_count + 1) % 20 self._print = self._print_count == 0 # step 1: initialize the action candidates TODO: use init_mean self._old_solutions = np.concatenate([ self.tf_sess.run(self._candidate_solutions)[:, 6:], np.random.uniform(self.lb[0], self.ub[0], [self.popsize, 6]) ], axis=1) self._candidate_solutions.load(self._old_solutions, self.tf_sess) avg_cost, min_cost = self.tf_sess.run( [self._average_cost, self._min_cost]) if self._print: logger.info('Init -> Avg_cost: %.3f, Min_cost: %.3f' % (avg_cost, min_cost)) # step 2: do gradient based planning for gbp_iteration in range(self._params.gbp_cfg.plan_iter): _, avg_cost, min_cost = self.tf_sess.run( [self._planning_optimizer, self._average_cost, self._min_cost]) avg_cost, min_cost = self.tf_sess.run( [self._average_cost, self._min_cost]) if self._print: logger.info('Iter %d > Avg_cost: %.3f, Min_cost: %.3f' % (self._params.gbp_cfg.plan_iter, avg_cost, min_cost)) sol = self.tf_sess.run(self.solution) prev_sol = self.update_prev_sol(per, dU, sol) return sol, prev_sol
def build_loss(self):
    # Build the state whitening variables and the start_state placeholders.
    self._build_ph()
    self._tensor, self._update_operator = {}, {}

    self._MLP_var_list = self._MLP.get_variable_list()
    # self._set_network_weight (defined via self._set_var_list) includes the
    # state whitening variables; _set_weight here covers the MLP weights only.
    self._set_weight = tf_utils.set_network_weights(
        self._session, self._MLP_var_list, '')

    # The learning rate is logged for reference only; no optimizer is built here.
    logger.info("policy training learning rate: {}".format(self.args.policy_lr))

    self._session.run(tf.variables_initializer(tf.global_variables()))

    # Synchronize the two networks if needed.
    if self.args.cem_type in ['POPLINP-SEP', 'POPLINP-UNI'] and \
            self.args.training_scheme in ['BC-PR', 'BC-PI']:
        weight_dict = self._get_weight()  # read weights from the MLP
        self._set_weight(weight_dict)     # write them into the target MLP
def __init__(self, params): """Creates class instance. Arguments: params .env (gym.env): Environment for which this controller will be used. .update_fns (list<func>): A list of functions that will be invoked (possibly with a tensorflow session) every time this controller is reset. .ac_ub (np.ndarray): (optional) An array of action upper bounds. Defaults to environment action upper bounds. .ac_lb (np.ndarray): (optional) An array of action lower bounds. Defaults to environment action lower bounds. .per (int): (optional) Determines how often the action sequence will be optimized. Defaults to 1 (reoptimizes at every call to act()). .prop_cfg .model_init_cfg (DotMap): A DotMap of initialization parameters for the model. .model_constructor (func): A function which constructs an instance of this model, given model_init_cfg. .model_train_cfg (dict): (optional) A DotMap of training parameters that will be passed into the model every time is is trained. Defaults to an empty dict. .model_pretrained (bool): (optional) If True, assumes that the model has been trained upon construction. .mode (str): Propagation method. Choose between [E, DS, TSinf, TS1, MM]. See https://arxiv.org/abs/1805.12114 for details. .npart (int): Number of particles used for DS, TSinf, TS1, and MM propagation methods. .ign_var (bool): (optional) Determines whether or not variance output of the model will be ignored. Defaults to False unless deterministic propagation is being used. .obs_preproc (func): (optional) A function which modifies observations (in a 2D matrix) before they are passed into the model. Defaults to lambda obs: obs. Note: Must be able to process both NumPy and Tensorflow arrays. .obs_postproc (func): (optional) A function which returns vectors calculated from the previous observations and model predictions, which will then be passed into the provided cost function on observations. Defaults to lambda obs, model_out: model_out. Note: Must be able to process both NumPy and Tensorflow arrays. .obs_postproc2 (func): (optional) A function which takes the vectors returned by obs_postproc and (possibly) modifies it into the predicted observations for the next time step. Defaults to lambda obs: obs. Note: Must be able to process both NumPy and Tensorflow arrays. .targ_proc (func): (optional) A function which takes current observations and next observations and returns the array of targets (so that the model learns the mapping obs -> targ_proc(obs, next_obs)). Defaults to lambda obs, next_obs: next_obs. Note: Only needs to process NumPy arrays. .opt_cfg .mode (str): Internal optimizer that will be used. Choose between [CEM, Random]. .cfg (DotMap): A map of optimizer initializer parameters. .plan_hor (int): The planning horizon that will be used in optimization. .obs_cost_fn (func): A function which computes the cost of every observation in a 2D matrix. Note: Must be able to process both NumPy and Tensorflow arrays. .ac_cost_fn (func): A function which computes the cost of every action in a 2D matrix. .log_cfg .save_all_models (bool): (optional) If True, saves models at every iteration. Defaults to False (only most recent model is saved). Warning: Can be very memory-intensive. .log_traj_preds (bool): (optional) If True, saves the mean and variance of predicted particle trajectories. Defaults to False. .log_particles (bool) (optional) If True, saves all predicted particles trajectories. Defaults to False. Note: Takes precedence over log_traj_preds. 
Warning: Can be very memory-intensive """ super().__init__(params) self._params = params self.dO, self.dU = params.env.observation_space.shape[0], params.env.action_space.shape[0] self.ac_ub, self.ac_lb = params.env.action_space.high, params.env.action_space.low assert np.max(self.ac_lb) == np.min(self.ac_lb) # just to make sure self.ac_ub = np.minimum(self.ac_ub, params.get("ac_ub", self.ac_ub)) self.ac_lb = np.maximum(self.ac_lb, params.get("ac_lb", self.ac_lb)) self.update_fns = params.get("update_fns", []) self.per = params.get("per", 1) self.model = get_required_argument( params.prop_cfg.model_init_cfg, "model_constructor", "Must provide a model constructor." )(params.prop_cfg.model_init_cfg, misc=params) self.model_train_cfg = params.prop_cfg.get("model_train_cfg", {}) self.prop_mode = get_required_argument(params.prop_cfg, "mode", "Must provide propagation method.") self.ign_var = params.prop_cfg.get("ign_var", False) or self.prop_mode == "E" self.obs_preproc = params.prop_cfg.get("obs_preproc", lambda obs: obs) self.obs_postproc = params.prop_cfg.get("obs_postproc", lambda obs, model_out: model_out) self.obs_postproc2 = params.prop_cfg.get("obs_postproc2", lambda next_obs: next_obs) self.targ_proc = params.prop_cfg.get("targ_proc", lambda obs, next_obs: next_obs) self.npart = get_required_argument(params.prop_cfg, "npart", "Must provide number of particles.") self.opt_mode = get_required_argument(params.opt_cfg, "mode", "Must provide optimization method.") self.plan_hor = get_required_argument(params.opt_cfg, "plan_hor", "Must provide planning horizon.") self.obs_cost_fn = get_required_argument(params.opt_cfg, "obs_cost_fn", "Must provide cost on observations.") self.ac_cost_fn = get_required_argument(params.opt_cfg, "ac_cost_fn", "Must provide cost on actions.") self.obs_ac_cost_fn = params.prop_cfg.get("obs_ac_cost_fn", None) self.save_all_models = params.log_cfg.get("save_all_models", False) self.log_traj_preds = params.log_cfg.get("log_traj_preds", False) self.log_particles = params.log_cfg.get("log_particles", False) # Perform argument checks if self.prop_mode not in ["E", "DS", "MM", "TS1", "TSinf", "GT"]: raise ValueError("Invalid propagation method.") if self.prop_mode in ["TS1", "TSinf"] and self.npart % self.model.num_nets != 0: raise ValueError("Number of particles must be a multiple of the ensemble size.") if self.prop_mode == "E" and self.npart != 1: raise ValueError("Deterministic propagation methods only need one particle.") # Create action sequence optimizer opt_cfg = params.opt_cfg.get("cfg", {}) self.optimizer = MPC.optimizers[params.opt_cfg.mode]( sol_dim=self.plan_hor * self.dU, lower_bound=np.tile(self.ac_lb, [self.plan_hor]), upper_bound=np.tile(self.ac_ub, [self.plan_hor]), tf_session=None if not self.model.is_tf_model else self.model.sess, params=params, **opt_cfg ) self._policy_network = self.optimizer.get_policy_network() # Controller state variables self.has_been_trained = params.prop_cfg.get("model_pretrained", False) self.ac_buf = np.array([]).reshape(0, self.dU) self.prev_sol = np.tile((self.ac_lb + self.ac_ub) / 2, [self.plan_hor]) self.init_var = np.tile(params.opt_cfg.init_var, [self.ac_lb.shape[0] * self.plan_hor]) self.train_in = np.array([]).reshape( 0, self.dU + self.obs_preproc(np.zeros([1, self.dO])).shape[-1] ) self.train_targs = np.array([]).reshape( 0, self.targ_proc(np.zeros([1, self.dO]), np.zeros([1, self.dO])).shape[-1] ) if self.model.is_tf_model: assert not self._params.il_cfg.use_gt_dynamics self.sy_cur_obs = 
tf.Variable(np.zeros(self.dO), dtype=tf.float32) self.ac_seq = tf.placeholder(shape=[1, self.plan_hor * self.dU], dtype=tf.float32) self.pred_cost, self.pred_traj = self._compile_cost(self.ac_seq, get_pred_trajs=True) self.optimizer.set_sy_cur_obs(self.sy_cur_obs) # IT IS A HACK. only run when using POPLINA-INIT self.optimizer.forward_policy_propose(self._predict_next_obs, self.sy_cur_obs) self.optimizer.setup(self._compile_cost, True) self.model.sess.run(tf.variables_initializer([self.sy_cur_obs])) self.prev_sol = self.optimizer.reset_prev_sol(self.prev_sol) # hack else: assert self._params.il_cfg.use_gt_dynamics logger.info("Created an MPC controller, prop mode %s, %d particles. " % (self.prop_mode, self.npart) + ("Ignoring variance." if self.ign_var else "")) if self.save_all_models: logger.info("Controller will save all models. (Note: This may be memory-intensive.") if self.log_particles: logger.info("Controller is logging particle predictions (Note: This may be memory-intensive).") self.pred_particles = [] elif self.log_traj_preds: logger.info("Controller is logging trajectory prediction statistics (mean+var).") self.pred_means, self.pred_vars = [], [] else: logger.info("Trajectory prediction logging is disabled.")
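# Hedged sketch (illustrative names and values, not a complete config): the
# overall shape of the params DotMap this controller expects when using the
# CEM optimizer with TSinf propagation. The costs here are toy stand-ins.
def _example_mpc_params(env, model_constructor):
    from dotmap import DotMap
    params = DotMap()
    params.env = env
    params.prop_cfg.model_init_cfg.model_constructor = model_constructor
    params.prop_cfg.mode = "TSinf"
    params.prop_cfg.npart = 20          # must be a multiple of the ensemble size
    params.opt_cfg.mode = "CEM"
    params.opt_cfg.plan_hor = 30
    params.opt_cfg.init_var = 0.25      # hypothetical initial CEM variance
    params.opt_cfg.obs_cost_fn = lambda obs: -obs[:, 0]            # toy cost
    params.opt_cfg.ac_cost_fn = lambda acs: 0.01 * (acs ** 2).sum(axis=1)
    return params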
import os.path as osp
import sys

# Make the repository root importable when this file is run directly.
sys.path.append(osp.dirname(osp.dirname(osp.realpath(__file__))))

from dmbrl.misc import logger

logger.info("appear twice")
def finalize(self, optimizer, optimizer_args=None, *args, **kwargs):
    """Finalizes the network.

    Arguments:
        optimizer: (tf.train.Optimizer) An optimizer class from those available
            at tf.train.Optimizer.
        optimizer_args: (dict) A dictionary of arguments for the __init__ method
            of the chosen optimizer.

    Returns: None
    """
    if len(self.layers) == 0:
        raise RuntimeError("Cannot finalize an empty network.")
    if self.finalized:
        raise RuntimeError("Can only finalize a network once.")

    optimizer_args = {} if optimizer_args is None else optimizer_args
    self.optimizer = optimizer(**optimizer_args)

    # Construct all variables.
    with self.sess.as_default():
        with tf.variable_scope(self.name):
            self.scaler = TensorStandardScaler(self.layers[0].get_input_dim())
            for i, layer in enumerate(self.layers):
                with tf.variable_scope("Layer%i" % i):
                    layer.construct_vars()
                    self.decays.extend(layer.get_decays())
                    self.optvars.extend(layer.get_vars())
    self.nonoptvars.extend(self.scaler.get_vars())

    # Set up training.
    with tf.variable_scope(self.name):
        # Recreate the optimizer within the model's variable scope.
        self.optimizer = optimizer(**optimizer_args)
        self.sy_train_in = tf.placeholder(
            dtype=tf.float32,
            shape=[self.num_nets, None, self.layers[0].get_input_dim()],
            name="training_inputs")
        self.sy_train_targ = tf.placeholder(
            dtype=tf.float32,
            shape=[self.num_nets, None, self.layers[-1].get_output_dim()],
            name="training_targets")
        train_loss = tf.reduce_sum(
            self._compile_losses(self.sy_train_in, self.sy_train_targ))
        train_loss += tf.add_n(self.decays)
        self.mse_loss = self._compile_losses(self.sy_train_in, self.sy_train_targ)
        self.train_op = self.optimizer.minimize(train_loss, var_list=self.optvars)

    # Initialize all variables.
    self.sess.run(tf.variables_initializer(
        self.optvars + self.nonoptvars + self.optimizer.variables()))

    # Set up prediction.
    with tf.variable_scope(self.name):
        self.sy_pred_in2d = tf.placeholder(
            dtype=tf.float32,
            shape=[None, self.layers[0].get_input_dim()],
            name="2D_training_inputs")
        self.sy_pred_mean2d_fac = self.create_prediction_tensors(
            self.sy_pred_in2d, factored=True)[0]
        self.sy_pred_mean2d = tf.reduce_mean(self.sy_pred_mean2d_fac, axis=0)
        self.sy_pred_var2d = tf.reduce_mean(
            tf.square(self.sy_pred_mean2d_fac - self.sy_pred_mean2d), axis=0)

        self.sy_pred_in3d = tf.placeholder(
            dtype=tf.float32,
            shape=[self.num_nets, None, self.layers[0].get_input_dim()],
            name="3D_training_inputs")
        self.sy_pred_mean3d_fac = \
            self.create_prediction_tensors(self.sy_pred_in3d, factored=True)[0]

    # Load model weights if needed.
    if self.model_loaded or self.load_model_values:
        with self.sess.as_default():
            load_path = self.model_dir  # ends with .npz
            logger.info("Restoring dynamics network weights from {}".format(load_path))
            data = np.load(load_path)
            for var_name in data.files:
                logger.info("Loading value to variable {}".format(var_name))
                tensor = self.sess.graph.get_tensor_by_name("{}:0".format(var_name))
                self.sess.run(tf.assign(tensor, data[var_name]))

    self.finalized = True
def build_loss(self): """ @brief: the MLP is used to generate samples, while the target_MLP is used during the training. target_MLP is always older than the MLP, and we feed the dataset into target_MLP to train MLP. After each update, we synchronize target_MLP by copying weights from MLP. """ # state whitening_operator is build in when calling _build_ph() self._build_ph() self._tensor, self._update_operator = {}, {} # here build target_state whitening_operator for normalize whitening_util.add_whitening_operator(self._whitening_operator, self._whitening_variable, 'target_state', self._observation_size) # the weight input_ph is always set to 0.0 self._input_ph['weight'] = tf.placeholder( shape=[None, self._MLP.get_weight_size()], dtype=tf.float32, name='weight_noise') # the actual weight generated from the planning self._input_ph['target_weight'] = tf.placeholder( shape=[None, self._MLP.get_weight_size()], dtype=tf.float32, name='target_weight_noise') self._tensor['net_input'] = (self._input_ph['start_state'] - self._whitening_operator['state_mean'] ) / self._whitening_operator['state_std'] self._tensor['target_net_input'] = ( self._input_ph['start_state'] - self._whitening_operator['target_state_mean'] ) / self._whitening_operator['target_state_std'] # the output policy of the network self._tensor['action'] = self._MLP(self._tensor['net_input'], self._input_ph['weight']) self._tensor['target_action'] = self._target_MLP( self._tensor['target_net_input'], self._input_ph['target_weight']) # the distillation loss self._update_operator['loss'] = tf.reduce_mean( tf.square(self._tensor['target_action'] - self._tensor['action'])) self._target_MLP_var_list = self._target_MLP.get_variable_list() self._MLP_var_list = self._MLP.get_variable_list() self._update_operator['update_op'] = tf.train.AdamOptimizer( learning_rate=self.args.policy_lr, ).minimize( self._update_operator['loss'], var_list=self._MLP_var_list) logger.info("policy training learning rate: {}".format( self.args.policy_lr)) # synchronize the weights self._get_weight = tf_utils.get_network_weights( self._session, self._MLP_var_list, 'policy_mlp') self._set_weight = tf_utils.set_network_weights( self._session, self._target_MLP_var_list, 'target_policy_mlp') self._session.run(tf.variables_initializer(tf.global_variables())) # synchronize the two networks if needed self._set_weight(self._get_weight()) # set the target MLP
def obtain_solution(self, init_mean, init_var, per, dU, obs=None): """Optimizes the cost function using the provided initial candidate distribution Arguments: init_mean (np.ndarray): The mean of the initial candidate distribution. init_var (np.ndarray): The variance of the initial candidate distribution. """ self._print_count = (self._print_count + 1) % 20 self._print = self._print_count == 0 if self._gbp_type == 3: sol, solvar = self.tf_sess.run([self.mean, self.var], feed_dict={ self.init_mean: init_mean, self.init_var: init_var }) self._tf_dict['mean']['candidate_solutions'].load( sol.reshape([1, -1]), self.tf_sess) avg_cost = self.tf_sess.run( self._tf_dict['mean']['costs']).reshape([-1]) if self._print: logger.info('Init -> cost: %.3f' % (avg_cost)) # step 2: do gradient based planning for gbp_iteration in range(self._params.gbp_cfg.plan_iter): self.tf_sess.run(self._tf_dict['mean']['planning_optimizer']) avg_cost = self.tf_sess.run( self._tf_dict['mean']['costs']).reshape([-1]) if self._print: logger.info('AFTER %d iter -> cost: %.3f' % (self._params.gbp_cfg.plan_iter, avg_cost)) sol = self.tf_sess.run(self._tf_dict['mean']['solution']).reshape( [-1]) elif self._gbp_type == 2: ''' @1 / 2: do the gradient based-planning with in the loop @1: do the planning for all the candidates @2: do the planning only for the top k candidates ''' mean, var, t = init_mean, init_var, 0 X = stats.truncnorm(-2, 2, loc=np.zeros_like(mean), scale=np.ones_like(mean)) while (t < self.max_iters) and np.max(var) > self.epsilon: lb_dist, ub_dist = mean - self.lb, self.ub - mean constrained_var = np.minimum( np.minimum(np.square(lb_dist / 2), np.square(ub_dist / 2)), var) samples = X.rvs(size=[self.popsize, self.sol_dim]) * \ np.sqrt(constrained_var) + mean self._tf_dict['popsize']['candidate_solutions'].load( samples.reshape([self.popsize, -1]), self.tf_sess) costs = self.tf_sess.run(self._tf_dict['popsize']['costs']) sort_id = np.argsort(costs) elites = samples[sort_id][:self.num_elites] # step 2: do gradient based planning for the top k candidates self._tf_dict['top_k']['candidate_solutions'].load( elites.reshape([self.num_elites, -1]), self.tf_sess) if self._print: logger.info('Init elites score -> cost: %f' % np.mean(costs[sort_id][:self.num_elites])) for gbp_iteration in range(self._params.gbp_cfg.plan_iter): self.tf_sess.run( self._tf_dict['top_k']['planning_optimizer']) if self._print: logger.info('AFTER %d iter -> cost: %f.' 
% (self._params.gbp_cfg.plan_iter, np.mean( self.tf_sess.run( self._tf_dict['top_k']['costs'])))) elites = self.tf_sess.run( self._tf_dict['top_k']['candidate_solutions']) new_mean = np.mean(elites, axis=0) new_var = np.var(elites, axis=0) mean = self.alpha * mean + (1 - self.alpha) * new_mean var = self.alpha * var + (1 - self.alpha) * new_var t += 1 sol, solvar = mean, var elif self._gbp_type == 1: ''' @1 / 2: do the gradient based-planning with in the loop @1: do the planning for all the candidates @2: do the planning only for the top k candidates ''' mean, var, t = init_mean, init_var, 0 X = stats.truncnorm(-2, 2, loc=np.zeros_like(mean), scale=np.ones_like(mean)) while (t < self.max_iters) and np.max(var) > self.epsilon: lb_dist, ub_dist = mean - self.lb, self.ub - mean constrained_var = np.minimum( np.minimum(np.square(lb_dist / 2), np.square(ub_dist / 2)), var) samples = X.rvs(size=[self.popsize, self.sol_dim]) * \ np.sqrt(constrained_var) + mean self._tf_dict['popsize']['candidate_solutions'].load( samples.reshape([self.popsize, -1]), self.tf_sess) costs = self.tf_sess.run(self._tf_dict['popsize']['costs']) sort_id = np.argsort(costs) old_elites = samples[sort_id][:self.num_elites] old_costs = costs[sort_id][:self.num_elites] if self._print: logger.info('Init elites score -> cost: %f' % np.mean(old_costs)) # step 2: do gradient based planning for gbp_iteration in range(self._params.gbp_cfg.plan_iter): self.tf_sess.run( self._tf_dict['popsize']['planning_optimizer']) samples, costs = self.tf_sess.run([ self._tf_dict['popsize']['candidate_solutions'], self._tf_dict['popsize']['costs'] ]) elites = np.concatenate([samples, old_elites], axis=0) costs = np.concatenate([costs, old_costs]) sort_id = np.argsort(costs) elites = elites[sort_id][:self.num_elites] costs = costs[sort_id][:self.num_elites] if self._print: logger.info( 'AFTER %d iter -> cost: %f.' % (self._params.gbp_cfg.plan_iter, np.mean(costs))) new_mean = np.mean(elites, axis=0) new_var = np.var(elites, axis=0) mean = self.alpha * mean + (1 - self.alpha) * new_mean var = self.alpha * var + (1 - self.alpha) * new_var t += 1 sol, solvar = mean, var else: assert False prev_sol = self.update_prev_sol(per, dU, sol) return sol, prev_sol
def run_experiment(self): """Perform experiment. """ os.makedirs(self.logdir, exist_ok=True) traj_obs, traj_acs, traj_rets, traj_rews = [], [], [], [] test_traj_obs, test_traj_acs, test_traj_rets = [], [], [] episode_iter_id = [] # Perform initial rollouts samples = [] needed_num_steps = self.ninit_rollouts * self.task_hor finished_num_steps = 0 """ # TODO DEBUG needed_num_steps = 64 self.task_hor = 64 """ while True: samples.append( self.agent.sample(self.task_hor, self.policy, self.delay_hor)) traj_obs.append(samples[-1]["obs"]) traj_acs.append(samples[-1]["ac"]) traj_rews.append(samples[-1]["rewards"]) finished_num_steps += len(samples[-1]["ac"]) print(finished_num_steps) if finished_num_steps >= needed_num_steps: break if self.ninit_rollouts > 0: self.policy.train([sample["obs"] for sample in samples], [sample["ac"] for sample in samples], [sample["rewards"] for sample in samples]) # Training loop for i in range(self.ntrain_iters): logger.info( "####################################################################" ) logger.info("Starting training iteration %d." % (i + 1)) iter_dir = os.path.join(self.logdir, "train_iter%d" % (i + 1)) os.makedirs(iter_dir, exist_ok=True) samples = [] assert self.nrecord == 0 needed_num_steps = self.task_hor * \ (max(self.neval, self.nrollouts_per_iter) - self.nrecord) finished_num_steps = 0 while True: samples.append( self.agent.sample(self.task_hor, self.policy, self.delay_hor)) finished_num_steps += len(samples[-1]["ac"]) if finished_num_steps >= needed_num_steps: break logger.info("Rewards obtained: {}".format( [sample["reward_sum"] for sample in samples[:self.neval]])) # test the policy if needed if self._params.misc.ctrl_cfg.cem_cfg.test_policy > 0: test_data = [] for _ in range(5): test_data.append( self.agent.sample(self.task_hor, self.policy, test_policy=True, average=False)) test_traj_rets.extend([ np.mean([ i_test_data["reward_sum"] for i_test_data in test_data ]) ]) test_traj_obs.extend( [i_test_data["obs"] for i_test_data in test_data]) test_traj_acs.extend( [i_test_data["ac"] for i_test_data in test_data]) traj_obs.extend([sample["obs"] for sample in samples]) traj_acs.extend([sample["ac"] for sample in samples]) traj_rets.extend([sample["reward_sum"] for sample in samples]) traj_rews.extend([sample["rewards"] for sample in samples]) episode_iter_id.extend([i] * len(samples)) samples = samples[:self.nrollouts_per_iter] self.policy.dump_logs(self.logdir, iter_dir) savemat( os.path.join(self.logdir, "logs.mat"), { "observations": traj_obs, "actions": traj_acs, "returns": traj_rets, "rewards": traj_rews, "test_returns": test_traj_rets, "test_obs": test_traj_obs, "test_acs": test_traj_acs, 'episode_iter_id': episode_iter_id }) # Delete iteration directory if not used if len(os.listdir(iter_dir)) == 0: os.rmdir(iter_dir) if i < self.ntrain_iters - 1: self.policy.train([sample["obs"] for sample in samples], [sample["ac"] for sample in samples], [sample["rewards"] for sample in samples])
def optimize_weights(self, data_dict, training_keys):
    self._set_whitening_var(data_dict['whitening_stats'])

    for i_epoch in range(self.args.discriminator_epochs):
        # Step 1: generate the GAN noise and shuffle the training ids.
        data_dict['weight'] = \
            generate_noise(data_dict['target_weight'], self.args.init_var)
        data_id = np.arange(len(data_dict['start_state']))
        self._npr.shuffle(data_id)
        num_minibatch = max(
            len(data_id) // self.args.discriminator_minibatch_size, 1)
        recorder = {'disc_loss': [], 'entropy': [], 'policy_loss': [],
                    'weight_decay': [], 'd_true_acc': [], 'd_fake_acc': []}

        for start in range(num_minibatch):
            start_id = start * self.args.discriminator_minibatch_size
            end_id = min(start_id + self.args.discriminator_minibatch_size,
                         len(data_dict['start_state']))
            batch_inds = data_id[start_id:end_id]
            feed_dict = {self._input_ph[key]: data_dict[key][batch_inds]
                         for key in training_keys}

            # Step 2: optimize the discriminator.
            disc_log = self._session.run(
                {'disc_loss': self._update_operator['discriminator_loss'],
                 'entropy': self._update_operator['entropy_loss'],
                 'd_true_acc': self._update_operator['true_data_accuracy'],
                 'op': self._update_operator['disc_update_op']},
                feed_dict=feed_dict)

            # Step 3: optimize the generator (train the policy network).
            policy_log = self._session.run(
                {'policy_loss': self._update_operator['policy_loss'],
                 'weight_decay': self._update_operator['weight_decay_loss'],
                 'd_fake_acc': self._update_operator['fake_data_accuracy'],
                 'op': self._update_operator['policy_update_op']},
                feed_dict=feed_dict)
            policy_log.update(disc_log)
            for key in recorder:
                recorder[key].append(policy_log[key])

        logger.info("GAN policy epoch: {}".format(i_epoch))
        for key in recorder:
            logger.info("\t[loss] " + key + ": " + "%.6f" % np.mean(recorder[key]))

    # Step 4: synchronize the target network.
    self._set_weight(self._get_weight())
    whitening_util.copy_whitening_var(data_dict['whitening_stats'],
                                      'state', 'target_state')
    whitening_util.set_whitening_var(self._session,
                                     self._whitening_operator,
                                     data_dict['whitening_stats'],
                                     ['target_state'])
def sample(self, horizon, policy, record_fname=None,
           test_policy=False, average=False):
    """Samples a rollout from the agent.

    Arguments:
        horizon: (int) The length of the rollout to generate from the agent.
        policy: (policy) The policy that the agent will use for actions.
        record_fname: (str/None) The name of the file to which a recording of the
            rollout will be saved. If None, the rollout will not be recorded.

    Returns: (dict) A dictionary containing data from the rollout.
        The keys of the dictionary are 'obs', 'ac', 'reward_sum', and 'rewards'.
    """
    if test_policy:
        logger.info('Testing the policy')
    video_record = record_fname is not None
    recorder = None if not video_record else VideoRecorder(self.env, record_fname)

    times, rewards = [], []
    O, A, reward_sum, done = [self.env.reset()], [], 0, False

    self._debug += 1
    policy.reset()
    for t in range(horizon):
        if hasattr(self.env, 'render_imitation'):
            self.env.render_imitation()
        if t % 50 == 10 and t > 1:
            logger.info('Current timesteps: %d / %d, average time: %.5f' %
                        (t, horizon, np.mean(times)))
        if video_record:
            recorder.capture_frame()
        start = time.time()
        if test_policy:
            A.append(policy.act(O[t], t, test_policy=test_policy, average=average))
        else:
            A.append(policy.act(O[t], t))
        times.append(time.time() - start)

        if self.noise_stddev is None:
            obs, reward, done, info = self.env.step(A[t])
        else:
            # Perturb the action with Gaussian noise, clipped to the action bounds.
            action = A[t] + np.random.normal(loc=0, scale=self.noise_stddev,
                                             size=[self.dU])
            action = np.minimum(np.maximum(action, self.env.action_space.low),
                                self.env.action_space.high)
            obs, reward, done, info = self.env.step(action)
        O.append(obs)
        reward_sum += reward
        rewards.append(reward)
        if done:
            break

    if video_record:
        recorder.capture_frame()
        recorder.close()

    logger.info("Average action selection time: %.4f" % np.mean(times))
    logger.info("Rollout length: %d" % len(A))

    return {
        "obs": np.array(O),
        "ac": np.array(A),
        "reward_sum": reward_sum,
        "rewards": np.array(rewards),
    }
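# Hedged usage sketch: collecting one recorded evaluation rollout with the
# method above. The mp4 filename is illustrative.
def _example_rollout(agent, policy, task_hor):
    sample = agent.sample(task_hor, policy,
                          record_fname="rollout.mp4", test_policy=True)
    return sample["reward_sum"], sample["obs"].shape[0]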
def run_experiment(self): """Perform experiment. """ os.makedirs(self.logdir, exist_ok=True) traj_obs, traj_acs, traj_rets, traj_rews = [], [], [], [] test_traj_obs, test_traj_acs, test_traj_rets = [], [], [] episode_iter_id = [] # Perform initial rollouts samples = [] needed_num_steps = self.ninit_rollouts * self.task_hor finished_num_steps = 0 """ # TODO DEBUG needed_num_steps = 64 self.task_hor = 64 """ # logger.info("Collecting n_init rollout before policy trainning") while True: samples.append(self.agent.sample(self.task_hor, self.policy)) traj_obs.append(samples[-1]["obs"]) traj_acs.append(samples[-1]["ac"]) traj_rews.append(samples[-1]["rewards"]) finished_num_steps += len(samples[-1]["ac"]) if finished_num_steps >= needed_num_steps: break if self.ninit_rollouts > 0: # logger.info("Performing init policy trianing") self.policy.train([sample["obs"] for sample in samples], [sample["ac"] for sample in samples], [sample["rewards"] for sample in samples]) # Training loop for i in range(self.ntrain_iters): logger.info( "####################################################################" ) logger.info("Starting training iteration %d." % (i + 1)) iter_dir = os.path.join(self.logdir, "train_iter%d" % (i + 1)) os.makedirs(iter_dir, exist_ok=True) samples = [] assert self.nrecord == 0 needed_num_steps = self.task_hor * \ (max(self.neval, self.nrollouts_per_iter) - self.nrecord) finished_num_steps = 0 while True: samples.append(self.agent.sample(self.task_hor, self.policy)) finished_num_steps += len(samples[-1]["ac"]) if finished_num_steps >= needed_num_steps: break logger.info("Rewards obtained: {}".format( [sample["reward_sum"] for sample in samples[:self.neval]])) # test the policy if needed # comment out by ShenShuo # passing while config to misc is much too messy # we juse comment it out, if need testing policy, we consider a smarter way to pass # test_policy arg # if self._params.misc.ctrl_cfg.cem_cfg.test_policy > 0: # test_data = [] # for _ in range(5): # test_data.append( # self.agent.sample(self.task_hor, self.policy, # test_policy=True, average=False) # ) # test_traj_rets.extend([ # np.mean([i_test_data["reward_sum"] for i_test_data in test_data]) # ]) # test_traj_obs.extend( # [i_test_data["obs"] for i_test_data in test_data] # ) # test_traj_acs.extend( # [i_test_data["ac"] for i_test_data in test_data] # ) traj_obs.extend([sample["obs"] for sample in samples]) traj_acs.extend([sample["ac"] for sample in samples]) traj_rets.extend([sample["reward_sum"] for sample in samples]) traj_rews.extend([sample["rewards"] for sample in samples]) episode_iter_id.extend([i] * len(samples)) samples = samples[:self.nrollouts_per_iter] self.policy.dump_logs(self.logdir, iter_dir) savemat( os.path.join(self.logdir, "logs.mat"), { "observations": traj_obs, "actions": traj_acs, "returns": traj_rets, "rewards": traj_rews, "test_returns": test_traj_rets, "test_obs": test_traj_obs, "test_acs": test_traj_acs, 'episode_iter_id': episode_iter_id }) # Delete iteration directory if not used if len(os.listdir(iter_dir)) == 0: os.rmdir(iter_dir) # train policy and model together if i < self.ntrain_iters - 1: self.policy.train([sample["obs"] for sample in samples], [sample["ac"] for sample in samples], [sample["rewards"] for sample in samples]) if i % 10 == 0: self.log_model_predictions(i)