def train(self):
    self.start_worker()
    self.init_opt()
    for itr in range(self.current_itr, self.n_itr):
        with logger.prefix('itr #%d | ' % itr):
            paths = self.sampler.obtain_samples(itr)
            samples_data = self.sampler.process_samples(itr, paths)
            self.log_diagnostics(paths)
            self.optimize_policy(itr, samples_data)
            logger.log("saving snapshot...")
            params = self.get_itr_snapshot(itr, samples_data)
            self.current_itr = itr + 1
            params["algo"] = self
            if self.store_paths:
                params["paths"] = samples_data["paths"]
            logger.save_itr_params(itr, params)
            logger.log("saved")
            logger.dump_tabular(with_prefix=False)
            if self.plot:
                self.update_plot()
                if self.pause_for_plot:
                    input("Plotting evaluation run: Press Enter to "
                          "continue...")
    self.shutdown_worker()
def worker_init_envs(G, alloc, scope, env):
    logger.log("initializing environment on worker %d" % G.worker_id)
    if not hasattr(G, 'parallel_vec_envs'):
        G.parallel_vec_envs = dict()
        G.parallel_vec_env_template = dict()
    G.parallel_vec_envs[scope] = [(idx, pickle.loads(pickle.dumps(env))) for idx in alloc]
    G.parallel_vec_env_template[scope] = env
def optimize(self, inputs, extra_inputs=None, callback=None):
    if len(inputs) == 0:
        # Assumes that we should always sample mini-batches
        raise NotImplementedError

    f_loss = self._opt_fun["f_loss"]

    if extra_inputs is None:
        extra_inputs = tuple()

    last_loss = f_loss(*(tuple(inputs) + extra_inputs))

    start_time = time.time()

    # Overload self._batch_size
    dataset = MAMLBatchDataset(
        inputs,
        num_batches=self._batch_size,
        extra_inputs=extra_inputs,
        meta_batch_size=self.meta_batch_size,
        num_grad_updates=self.num_grad_updates)

    sess = tf.get_default_session()

    for epoch in range(self._max_epochs):
        if self._verbose:
            logger.log("Epoch %d" % (epoch))
            progbar = pyprind.ProgBar(len(inputs[0]))

        for j, batch in enumerate(dataset.iterate(update=True)):
            if self._init_train_op is not None:
                sess.run(self._init_train_op, dict(list(zip(self._input_vars, batch))))
                self._init_train_op = None  # only use it once
            else:
                if self.multi_adam > 1:
                    sess.run(self._train_ops[epoch * self._batch_size + j],
                             dict(list(zip(self._input_vars, batch))))
                else:
                    sess.run(self._train_op, dict(list(zip(self._input_vars, batch))))

            if self._verbose:
                progbar.update(len(batch[0]))

        if self._verbose:
            if progbar.active:
                progbar.stop()

        new_loss = f_loss(*(tuple(inputs) + extra_inputs))

        if self._verbose:
            logger.log("Epoch: %d | Loss: %f" % (epoch, new_loss))

        if self._callback or callback:
            elapsed = time.time() - start_time
            callback_args = dict(
                loss=new_loss,
                params=self._target.get_param_values(trainable=True) if self._target else None,
                itr=epoch,
                elapsed=elapsed,
            )
            if self._callback:
                self._callback(callback_args)
            if callback:
                callback(**callback_args)

        if abs(last_loss - new_loss) < self._tolerance:
            break
        last_loss = new_loss
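# The optimize() above accepts an optional progress callback; a minimal hedged
# sketch of one is given below. The keyword arguments mirror the callback_args
# dict constructed in the loop above; `optimizer`, `inputs` and `extra` in the
# commented call are placeholder names, not symbols from this codebase.
def log_progress(loss, params, itr, elapsed):
    # params may be None when no target is attached to the optimizer
    logger.log("epoch %d: loss=%f (%.1fs elapsed)" % (itr, loss, elapsed))

# optimizer.optimize(inputs, extra_inputs=extra, callback=log_progress)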
def optimize_policy(self, itr, samples_data):
    all_input_values = tuple(
        ext.extract(samples_data, "observations", "actions", "advantages"))
    agent_infos = samples_data["agent_infos"]
    state_info_list = [agent_infos[k] for k in self.policy.state_info_keys]
    dist_info_list = [
        agent_infos[k] for k in self.policy.distribution.dist_info_keys
    ]
    all_input_values += tuple(state_info_list) + tuple(dist_info_list)
    if self.policy.recurrent:
        all_input_values += (samples_data["valids"],)
    logger.log("Computing loss before")
    loss_before = self.optimizer.loss(all_input_values)
    logger.log("Computing KL before")
    mean_kl_before = self.optimizer.constraint_val(all_input_values)
    logger.log("Optimizing")
    self.optimizer.optimize(all_input_values)
    logger.log("Computing KL after")
    mean_kl = self.optimizer.constraint_val(all_input_values)
    logger.log("Computing loss after")
    loss_after = self.optimizer.loss(all_input_values)
    logger.record_tabular('LossBefore', loss_before)
    logger.record_tabular('LossAfter', loss_after)
    logger.record_tabular('MeanKLBefore', mean_kl_before)
    logger.record_tabular('MeanKL', mean_kl)
    logger.record_tabular('dLoss', loss_before - loss_after)
    return dict()
def __init__(self, env_name, record_video=True, video_schedule=None,
             log_dir=None, record_log=True, force_reset=False):
    if log_dir is None:
        if logger.get_snapshot_dir() is None:
            logger.log("Warning: skipping Gym environment monitoring since snapshot_dir not configured.")
        else:
            log_dir = os.path.join(logger.get_snapshot_dir(), "gym_log")
    Serializable.quick_init(self, locals())

    env = gym.envs.make(env_name)
    self.env = env
    self.env_id = env.spec.id

    monitor_manager.logger.setLevel(logging.WARNING)

    assert not (not record_log and record_video)

    if log_dir is None or record_log is False:
        self.monitoring = False
    else:
        if not record_video:
            video_schedule = NoVideoSchedule()
        else:
            if video_schedule is None:
                video_schedule = CappedCubicVideoSchedule()
        self.env = gym.wrappers.Monitor(self.env, log_dir, video_callable=video_schedule, force=True)
        self.monitoring = True

    self._observation_space = convert_gym_space(env.observation_space)
    self._action_space = convert_gym_space(env.action_space)
    self._horizon = env.spec.timestep_limit
    self._log_dir = log_dir
    self._force_reset = force_reset
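# A hedged usage sketch for the constructor above. The class name and import
# path are assumptions (this excerpt only shows __init__); the environment id
# is an arbitrary example. Passing record_log=False together with
# record_video=False satisfies the assertion and disables monitoring entirely.
# from rllab.envs.gym_env import GymEnv   # assumed location
# env = GymEnv("CartPole-v0", record_video=False, record_log=False, force_reset=True)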
def optimize_gen(self, inputs, extra_inputs=None, callback=None, yield_itr=None):
    if len(inputs) == 0:
        # Assumes that we should always sample mini-batches
        raise NotImplementedError

    f_opt = self._opt_fun["f_opt"]
    f_loss = self._opt_fun["f_loss"]

    if extra_inputs is None:
        extra_inputs = tuple()

    last_loss = f_loss(*(tuple(inputs) + extra_inputs))

    start_time = time.time()

    dataset = BatchDataset(
        inputs, self._batch_size,
        extra_inputs=extra_inputs
        # , randomized=self._randomized
    )

    itr = 0
    for epoch in pyprind.prog_bar(list(range(self._max_epochs))):
        for batch in dataset.iterate(update=True):
            f_opt(*batch)
            if yield_itr is not None and (itr % (yield_itr + 1)) == 0:
                yield
            itr += 1

        new_loss = f_loss(*(tuple(inputs) + extra_inputs))
        if self._verbose:
            logger.log("Epoch %d, loss %s" % (epoch, new_loss))

        if self._callback or callback:
            elapsed = time.time() - start_time
            callback_args = dict(
                loss=new_loss,
                params=self._target.get_param_values(trainable=True) if self._target else None,
                itr=epoch,
                elapsed=elapsed,
            )
            if self._callback:
                self._callback(callback_args)
            if callback:
                callback(**callback_args)

        if abs(last_loss - new_loss) < self._tolerance:
            break
        last_loss = new_loss
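# Hedged usage sketch for the generator form above: iterating the generator
# drives the optimization, and with yield_itr=0 control is handed back after
# every minibatch update (e.g. to interleave logging or evaluation).
# `optimizer`, `inputs` and `extra` are placeholder names.
# for _ in optimizer.optimize_gen(inputs, extra_inputs=extra, yield_itr=0):
#     pass  # per-minibatch work could go here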
def optimize_policy(self, itr, all_samples_data):
    assert len(all_samples_data) == self.num_grad_updates + 1  # we collected the rollouts to compute the grads and then the test!

    if not self.use_maml:
        all_samples_data = [all_samples_data[0]]

    input_list = []
    for step in range(len(all_samples_data)):  # these are the gradient steps
        obs_list, action_list, adv_list = [], [], []
        for i in range(self.meta_batch_size):
            inputs = ext.extract(all_samples_data[step][i],
                                 "observations", "actions", "advantages")
            obs_list.append(inputs[0])
            action_list.append(inputs[1])
            adv_list.append(inputs[2])
        input_list += obs_list + action_list + adv_list  # [ [obs_0], [act_0], [adv_0], [obs_1], ... ]

        if step == 0:  ##CF not used?
            init_inputs = input_list

    if self.use_maml:
        dist_info_list = []
        for i in range(self.meta_batch_size):
            agent_infos = all_samples_data[self.kl_constrain_step][i]['agent_infos']
            dist_info_list += [
                agent_infos[k] for k in self.policy.distribution.dist_info_keys
            ]
        input_list += tuple(dist_info_list)
        logger.log("Computing KL before")
        mean_kl_before = self.optimizer.constraint_val(input_list)

    logger.log("Computing loss before")
    loss_before = self.optimizer.loss(input_list)
    logger.log("Optimizing")
    self.optimizer.optimize(input_list)
    logger.log("Computing loss after")
    loss_after = self.optimizer.loss(input_list)
    if self.use_maml:
        logger.log("Computing KL after")
        mean_kl = self.optimizer.constraint_val(input_list)
        logger.record_tabular('MeanKLBefore', mean_kl_before)  # this now won't be 0!
        logger.record_tabular('MeanKL', mean_kl)
    logger.record_tabular('LossBefore', loss_before)
    logger.record_tabular('LossAfter', loss_after)
    logger.record_tabular('dLoss', loss_before - loss_after)
    return dict()
def populate_task(env, policy, scope=None):
    logger.log("Populating workers...")
    if singleton_pool.n_parallel > 1:
        singleton_pool.run_each(
            _worker_populate_task,
            [(pickle.dumps(env), pickle.dumps(policy), scope)] * singleton_pool.n_parallel
        )
    else:
        # avoid unnecessary copying
        G = _get_scoped_G(singleton_pool.G, scope)
        G.env = env
        G.policy = policy
    logger.log("Populated")
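# Hedged usage sketch: populate_task is meant to be called once before
# sampling so that each parallel worker (or the local G, when n_parallel == 1)
# holds its own copies of the environment and policy. `env`, `policy` and the
# scope string below are placeholders.
# populate_task(env, policy)                # default (unscoped) setup
# populate_task(env, policy, scope="eval")  # a separately scoped copy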
def worker_run_reset(G, flags, scope):
    if not hasattr(G, 'parallel_vec_envs'):
        logger.log("on worker %d" % G.worker_id)
        import traceback
        for line in traceback.format_stack():
            logger.log(line)
        # log the stacktrace at least
        logger.log("oops")
        for k, v in G.__dict__.items():
            logger.log(str(k) + " : " + str(v))
        assert hasattr(G, 'parallel_vec_envs')

    assert scope in G.parallel_vec_envs
    N = len(G.parallel_vec_envs[scope])
    env_template = G.parallel_vec_env_template[scope]
    obs_dim = env_template.observation_space.flat_dim
    ret_arr = np.zeros((N, obs_dim))
    ids = []
    flat_obs = []
    reset_ids = []
    for itr_idx, (idx, env) in enumerate(G.parallel_vec_envs[scope]):
        flag = flags[idx]
        if flag:
            flat_obs.append(env.reset())
            reset_ids.append(itr_idx)
        ids.append(idx)
    if len(reset_ids) > 0:
        ret_arr[reset_ids] = env_template.observation_space.flatten_n(flat_obs)
    return ids, ret_arr
def optimize_policy(self, itr, samples_data):
    logger.log("optimizing policy")
    inputs = ext.extract(samples_data, "observations", "actions", "advantages")
    agent_infos = samples_data["agent_infos"]
    state_info_list = [agent_infos[k] for k in self.policy.state_info_keys]
    inputs += tuple(state_info_list)
    if self.policy.recurrent:
        inputs += (samples_data["valids"],)
    dist_info_list = [
        agent_infos[k] for k in self.policy.distribution.dist_info_keys
    ]
    loss_before = self.optimizer.loss(inputs)
    self.optimizer.optimize(inputs)
    loss_after = self.optimizer.loss(inputs)
    logger.record_tabular("LossBefore", loss_before)
    logger.record_tabular("LossAfter", loss_after)

    mean_kl, max_kl = self.opt_info['f_kl'](*(list(inputs) + dist_info_list))
    logger.record_tabular('MeanKL', mean_kl)
    logger.record_tabular('MaxKL', max_kl)
def optimize_policy(self, itr, all_samples_data):
    logger.log("optimizing policy")
    assert len(all_samples_data) == self.num_grad_updates + 1

    if not self.use_maml:
        all_samples_data = [all_samples_data[0]]

    input_list = []
    for step in range(len(all_samples_data)):
        obs_list, action_list, adv_list = [], [], []
        for i in range(self.meta_batch_size):
            inputs = ext.extract(all_samples_data[step][i],
                                 "observations", "actions", "advantages")
            obs_list.append(inputs[0])
            action_list.append(inputs[1])
            adv_list.append(inputs[2])
        input_list += obs_list + action_list + adv_list

        if step == 0:
            init_inputs = input_list

    loss_before = self.optimizer.loss(input_list)
    self.optimizer.optimize(input_list)
    loss_after = self.optimizer.loss(input_list)
    logger.record_tabular("LossBefore", loss_before)
    logger.record_tabular("LossAfter", loss_after)

    dist_info_list = []
    for i in range(self.meta_batch_size):
        agent_infos = all_samples_data[-1][i]['agent_infos']
        dist_info_list += [
            agent_infos[k] for k in self.policy.distribution.dist_info_keys
        ]
    if self.use_maml:
        mean_kl, max_kl = self.opt_info['f_kl'](*(list(input_list) + dist_info_list))
        logger.record_tabular('MeanKL', mean_kl)
        logger.record_tabular('MaxKL', max_kl)
def evaluate(self, epoch, pool):
    logger.log("Collecting samples for evaluation")
    paths = parallel_sampler.sample_paths(
        policy_params=self.policy.get_param_values(),
        max_samples=self.eval_samples,
        max_path_length=self.max_path_length,
    )

    average_discounted_return = np.mean([
        special.discount_return(path["rewards"], self.discount)
        for path in paths
    ])

    returns = [sum(path["rewards"]) for path in paths]

    all_qs = np.concatenate(self.q_averages)
    all_ys = np.concatenate(self.y_averages)

    average_q_loss = np.mean(self.qf_loss_averages)
    average_policy_surr = np.mean(self.policy_surr_averages)
    average_action = np.mean(
        np.square(np.concatenate([path["actions"] for path in paths])))

    policy_reg_param_norm = np.linalg.norm(
        self.policy.get_param_values(regularizable=True))
    qfun_reg_param_norm = np.linalg.norm(
        self.qf.get_param_values(regularizable=True))

    logger.record_tabular('Epoch', epoch)
    logger.record_tabular('AverageReturn', np.mean(returns))
    logger.record_tabular('StdReturn', np.std(returns))
    logger.record_tabular('MaxReturn', np.max(returns))
    logger.record_tabular('MinReturn', np.min(returns))
    if len(self.es_path_returns) > 0:
        logger.record_tabular('AverageEsReturn', np.mean(self.es_path_returns))
        logger.record_tabular('StdEsReturn', np.std(self.es_path_returns))
        logger.record_tabular('MaxEsReturn', np.max(self.es_path_returns))
        logger.record_tabular('MinEsReturn', np.min(self.es_path_returns))
    logger.record_tabular('AverageDiscountedReturn', average_discounted_return)
    logger.record_tabular('AverageQLoss', average_q_loss)
    logger.record_tabular('AveragePolicySurr', average_policy_surr)
    logger.record_tabular('AverageQ', np.mean(all_qs))
    logger.record_tabular('AverageAbsQ', np.mean(np.abs(all_qs)))
    logger.record_tabular('AverageY', np.mean(all_ys))
    logger.record_tabular('AverageAbsY', np.mean(np.abs(all_ys)))
    logger.record_tabular('AverageAbsQYDiff', np.mean(np.abs(all_qs - all_ys)))
    logger.record_tabular('AverageAction', average_action)
    logger.record_tabular('PolicyRegParamNorm', policy_reg_param_norm)
    logger.record_tabular('QFunRegParamNorm', qfun_reg_param_norm)

    self.env.log_diagnostics(paths)
    self.policy.log_diagnostics(paths)

    self.qf_loss_averages = []
    self.policy_surr_averages = []
    self.q_averages = []
    self.y_averages = []
    self.es_path_returns = []
def train(self):
    # This seems like a rather sequential method
    pool = SimpleReplayPool(
        max_pool_size=self.replay_pool_size,
        observation_dim=self.env.observation_space.flat_dim,
        action_dim=self.env.action_space.flat_dim,
    )
    self.start_worker()

    self.init_opt()
    itr = 0
    path_length = 0
    path_return = 0
    terminal = False
    observation = self.env.reset()

    sample_policy = pickle.loads(pickle.dumps(self.policy))

    for epoch in range(self.n_epochs):
        logger.push_prefix('epoch #%d | ' % epoch)
        logger.log("Training started")
        for epoch_itr in pyprind.prog_bar(range(self.epoch_length)):
            # Execute policy
            if terminal:  # or path_length > self.max_path_length:
                # Note that if the last time step ends an episode, the very
                # last state and observation will be ignored and not added
                # to the replay pool
                observation = self.env.reset()
                self.es.reset()
                sample_policy.reset()
                self.es_path_returns.append(path_return)
                path_length = 0
                path_return = 0
            action = self.es.get_action(itr, observation, policy=sample_policy)  # qf=qf)

            next_observation, reward, terminal, _ = self.env.step(action)
            path_length += 1
            path_return += reward

            if not terminal and path_length >= self.max_path_length:
                terminal = True
                # only include the terminal transition in this case if the flag was set
                if self.include_horizon_terminal_transitions:
                    pool.add_sample(observation, action, reward * self.scale_reward, terminal)
            else:
                pool.add_sample(observation, action, reward * self.scale_reward, terminal)

            observation = next_observation

            if pool.size >= self.min_pool_size:
                for update_itr in range(self.n_updates_per_sample):
                    # Train policy
                    batch = pool.random_batch(self.batch_size)
                    self.do_training(itr, batch)
                sample_policy.set_param_values(self.policy.get_param_values())

            itr += 1

        logger.log("Training finished")
        if pool.size >= self.min_pool_size:
            self.evaluate(epoch, pool)
            params = self.get_epoch_snapshot(epoch)
            logger.save_itr_params(epoch, params)
        logger.dump_tabular(with_prefix=False)
        logger.pop_prefix()
        if self.plot:
            self.update_plot()
            if self.pause_for_plot:
                input("Plotting evaluation run: Press Enter to "
                      "continue...")
    self.env.terminate()
    self.policy.terminate()
def optimize_policy(self, itr, samples_data):
    # Init vars
    rewards = samples_data['rewards']
    actions = samples_data['actions']
    observations = samples_data['observations']

    agent_infos = samples_data["agent_infos"]
    state_info_list = [agent_infos[k] for k in self.policy.state_info_keys]
    dist_info_list = [
        agent_infos[k] for k in self.policy.distribution.dist_info_keys
    ]
    if self.policy.recurrent:
        recurrent_vals = [samples_data["valids"]]
    else:
        recurrent_vals = []

    # Compute sample Bellman error.
    feat_diff = []
    for path in samples_data['paths']:
        feats = self._features(path)
        feats = np.vstack([feats, np.zeros(feats.shape[1])])
        feat_diff.append(feats[1:] - feats[:-1])
    if self.policy.recurrent:
        max_path_length = max(
            [len(path["advantages"]) for path in samples_data["paths"]])
        # pad feature diffs
        feat_diff = np.array([
            tensor_utils.pad_tensor(fd, max_path_length) for fd in feat_diff
        ])
    else:
        feat_diff = np.vstack(feat_diff)

    #################
    # Optimize dual #
    #################

    # Here we need to optimize dual through BFGS in order to obtain \eta
    # value. Initialize dual function g(\theta, v). \eta > 0
    # First eval delta_v
    f_dual = self.opt_info['f_dual']
    f_dual_grad = self.opt_info['f_dual_grad']

    # Set BFGS eval function
    def eval_dual(input):
        param_eta = input[0]
        param_v = input[1:]
        val = f_dual(*([rewards, feat_diff] + state_info_list +
                       recurrent_vals + [param_eta, param_v]))
        return val.astype(np.float64)

    # Set BFGS gradient eval function
    def eval_dual_grad(input):
        param_eta = input[0]
        param_v = input[1:]
        grad = f_dual_grad(*([rewards, feat_diff] + state_info_list +
                             recurrent_vals + [param_eta, param_v]))
        eta_grad = np.float(grad[0])
        v_grad = grad[1]
        return np.hstack([eta_grad, v_grad])

    # Initial BFGS parameter values.
    x0 = np.hstack([self.param_eta, self.param_v])

    # Set parameter boundaries: \eta > 0, v unrestricted.
    bounds = [(-np.inf, np.inf) for _ in x0]
    bounds[0] = (0., np.inf)

    # Optimize through BFGS
    logger.log('optimizing dual')
    eta_before = x0[0]
    dual_before = eval_dual(x0)
    params_ast, _, _ = self.optimizer(
        func=eval_dual,
        x0=x0,
        fprime=eval_dual_grad,
        bounds=bounds,
        maxiter=self.max_opt_itr,
        disp=0)
    dual_after = eval_dual(params_ast)

    # Optimal values have been obtained
    self.param_eta = params_ast[0]
    self.param_v = params_ast[1:]

    ###################
    # Optimize policy #
    ###################
    cur_params = self.policy.get_param_values(trainable=True)
    f_loss = self.opt_info["f_loss"]
    f_loss_grad = self.opt_info['f_loss_grad']
    input = [rewards, observations, feat_diff, actions] + \
        state_info_list + recurrent_vals + [self.param_eta, self.param_v]

    # Set loss eval function
    def eval_loss(params):
        self.policy.set_param_values(params, trainable=True)
        val = f_loss(*input)
        return val.astype(np.float64)

    # Set loss gradient eval function
    def eval_loss_grad(params):
        self.policy.set_param_values(params, trainable=True)
        grad = f_loss_grad(*input)
        flattened_grad = tensor_utils.flatten_tensors(
            list(map(np.asarray, grad)))
        return flattened_grad.astype(np.float64)

    loss_before = eval_loss(cur_params)
    logger.log('optimizing policy')
    params_ast, _, _ = self.optimizer(
        func=eval_loss,
        x0=cur_params,
        fprime=eval_loss_grad,
        disp=0,
        maxiter=self.max_opt_itr)
    loss_after = eval_loss(params_ast)

    f_kl = self.opt_info['f_kl']

    mean_kl = f_kl(*([observations, actions] + state_info_list +
                     dist_info_list + recurrent_vals)).astype(np.float64)

    logger.log('eta %f -> %f' % (eta_before, self.param_eta))

    logger.record_tabular("LossBefore", loss_before)
    logger.record_tabular("LossAfter", loss_after)
    logger.record_tabular('DualBefore', dual_before)
    logger.record_tabular('DualAfter', dual_after)
    logger.record_tabular('MeanKL', mean_kl)
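# The two self.optimizer(...) calls above use the keyword interface of
# scipy.optimize.fmin_l_bfgs_b (func, x0, fprime, bounds, maxiter, disp).
# Below is a self-contained, purely illustrative toy version of the dual step
# with the same eta-bound structure; the objective and dimensions are made up.
import numpy as np
import scipy.optimize

def toy_dual(x):
    # toy stand-in for the dual objective g(eta, v)
    return float(np.sum((x - 1.0) ** 2))

def toy_dual_grad(x):
    return 2.0 * (x - 1.0)

x0 = np.array([0.5, 0.0, 0.0])  # [eta, v_1, v_2]
bounds = [(0., None)] + [(None, None)] * (len(x0) - 1)  # eta > 0, v unrestricted
x_opt, dual_val, info = scipy.optimize.fmin_l_bfgs_b(
    func=toy_dual, x0=x0, fprime=toy_dual_grad,
    bounds=bounds, maxiter=50, disp=0)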
def obtain_samples(self, itr, reset_args=None, return_dict=False, log_prefix=''):
    # reset_args: arguments to pass to the environments to reset
    # return_dict: whether or not to return a dictionary or list form of paths
    logger.log("Obtaining samples for iteration %d..." % itr)

    #paths = []
    paths = {}
    for i in range(self.vec_env.num_envs):
        paths[i] = []

    # if the reset args are not list/numpy, we set the same args for each env
    if reset_args is not None and (type(reset_args) != list and type(reset_args) != np.ndarray):
        reset_args = [reset_args] * self.vec_env.num_envs

    n_samples = 0
    obses = self.vec_env.reset(reset_args)
    dones = np.asarray([True] * self.vec_env.num_envs)
    running_paths = [None] * self.vec_env.num_envs

    pbar = ProgBarCounter(self.algo.batch_size)
    policy_time = 0
    env_time = 0
    process_time = 0

    policy = self.algo.policy
    import time

    while n_samples < self.algo.batch_size:
        t = time.time()
        policy.reset(dones)
        actions, agent_infos = policy.get_actions(obses)
        policy_time += time.time() - t

        t = time.time()
        next_obses, rewards, dones, env_infos = self.vec_env.step(actions, reset_args)
        env_time += time.time() - t

        t = time.time()
        agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
        env_infos = tensor_utils.split_tensor_dict_list(env_infos)
        if env_infos is None:
            env_infos = [dict() for _ in range(self.vec_env.num_envs)]
        if agent_infos is None:
            agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
        for idx, observation, action, reward, env_info, agent_info, done in zip(
                itertools.count(), obses, actions, rewards, env_infos, agent_infos, dones):
            if running_paths[idx] is None:
                running_paths[idx] = dict(
                    observations=[],
                    actions=[],
                    rewards=[],
                    env_infos=[],
                    agent_infos=[],
                )
            running_paths[idx]["observations"].append(observation)
            running_paths[idx]["actions"].append(action)
            running_paths[idx]["rewards"].append(reward)
            running_paths[idx]["env_infos"].append(env_info)
            running_paths[idx]["agent_infos"].append(agent_info)
            if done:
                paths[idx].append(dict(
                    observations=self.env_spec.observation_space.flatten_n(running_paths[idx]["observations"]),
                    actions=self.env_spec.action_space.flatten_n(running_paths[idx]["actions"]),
                    rewards=tensor_utils.stack_tensor_list(running_paths[idx]["rewards"]),
                    env_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
                    agent_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
                ))
                n_samples += len(running_paths[idx]["rewards"])
                running_paths[idx] = None
        process_time += time.time() - t
        pbar.inc(len(obses))
        obses = next_obses

    pbar.stop()

    logger.record_tabular(log_prefix + "PolicyExecTime", policy_time)
    logger.record_tabular(log_prefix + "EnvExecTime", env_time)
    logger.record_tabular(log_prefix + "ProcessExecTime", process_time)

    if not return_dict:
        flatten_list = lambda l: [item for sublist in l for item in sublist]
        paths = flatten_list(paths.values())
        #path_keys = flatten_list([[key]*len(paths[key]) for key in paths.keys()])

    return paths
def optimize(self, inputs):
    inputs = tuple(inputs)

    try_penalty = np.clip(self._penalty, self._min_penalty, self._max_penalty)

    penalty_scale_factor = None
    f_opt = self._opt_fun["f_opt"]
    f_penalized_loss = self._opt_fun["f_penalized_loss"]

    def gen_f_opt(penalty):
        def f(flat_params):
            self._target.set_param_values(flat_params, trainable=True)
            return f_opt(*(inputs + (penalty,)))
        return f

    cur_params = self._target.get_param_values(trainable=True).astype('float64')
    opt_params = cur_params

    for penalty_itr in range(self._max_penalty_itr):
        logger.log('trying penalty=%.3f...' % try_penalty)

        itr_opt_params, _, _ = scipy.optimize.fmin_l_bfgs_b(
            func=gen_f_opt(try_penalty), x0=cur_params,
            maxiter=self._max_opt_itr
        )

        _, try_loss, try_constraint_val = f_penalized_loss(*(inputs + (try_penalty,)))

        logger.log('penalty %f => loss %f, %s %f' %
                   (try_penalty, try_loss, self._constraint_name, try_constraint_val))

        # Either constraint satisfied, or we are at the last iteration already and no alternative parameter
        # satisfies the constraint
        if try_constraint_val < self._max_constraint_val or \
                (penalty_itr == self._max_penalty_itr - 1 and opt_params is None):
            opt_params = itr_opt_params

        if not self._adapt_penalty:
            break

        # Decide scale factor on the first iteration, or if constraint violation yields numerical error
        if penalty_scale_factor is None or np.isnan(try_constraint_val):
            # Increase penalty if constraint violated, or if constraint term is NAN
            if try_constraint_val > self._max_constraint_val or np.isnan(try_constraint_val):
                penalty_scale_factor = self._increase_penalty_factor
            else:
                # Otherwise (i.e. constraint satisfied), shrink penalty
                penalty_scale_factor = self._decrease_penalty_factor
                opt_params = itr_opt_params
        else:
            if penalty_scale_factor > 1 and \
                    try_constraint_val <= self._max_constraint_val:
                break
            elif penalty_scale_factor < 1 and \
                    try_constraint_val >= self._max_constraint_val:
                break
        try_penalty *= penalty_scale_factor
        try_penalty = np.clip(try_penalty, self._min_penalty, self._max_penalty)
        self._penalty = try_penalty

    self._target.set_param_values(opt_params, trainable=True)
filename = str(uuid.uuid4())

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('file', type=str,
                        help='path to the snapshot file')
    parser.add_argument('--log_dir', type=str, default=None,
                        help='path to the new log directory')
    # Look for params.json file
    args = parser.parse_args()
    parent_dir = os.path.dirname(os.path.realpath(args.file))
    json_file_path = os.path.join(parent_dir, "params.json")
    logger.log("Looking for params.json at %s..." % json_file_path)
    try:
        with open(json_file_path, "r") as f:
            params = json.load(f)
        # exclude certain parameters
        excluded = ['json_args']
        for k in excluded:
            if k in params:
                del params[k]
        for k, v in list(params.items()):
            if v is None:
                del params[k]
        if args.log_dir is not None:
            params['log_dir'] = args.log_dir
        params['resume_from'] = args.file
        command = to_local_command(params,
def train(self):
    with tf.Session() as sess:
        if self.load_policy is not None:
            import joblib
            self.policy = joblib.load(self.load_policy)['policy']
        self.init_opt()
        # initialize uninitialized vars (I know, it's ugly)
        uninit_vars = []
        for var in tf.global_variables():
            try:
                sess.run(var)
            except tf.errors.FailedPreconditionError:
                uninit_vars.append(var)
        sess.run(tf.variables_initializer(uninit_vars))
        #sess.run(tf.initialize_all_variables())

        self.start_worker()
        start_time = time.time()
        for itr in range(self.start_itr, self.n_itr):
            itr_start_time = time.time()
            with logger.prefix('itr #%d | ' % itr):
                logger.log("Obtaining samples...")
                paths = self.obtain_samples(itr)
                logger.log("Processing samples...")
                samples_data = self.process_samples(itr, paths)
                logger.log("Logging diagnostics...")
                self.log_diagnostics(paths)
                logger.log("Optimizing policy...")
                self.optimize_policy(itr, samples_data)
                #new_param_values = self.policy.get_variable_values(self.policy.all_params)
                logger.log("Saving snapshot...")
                params = self.get_itr_snapshot(itr, samples_data)  # , **kwargs)
                if self.store_paths:
                    params["paths"] = samples_data["paths"]
                logger.save_itr_params(itr, params)
                logger.log("Saved")
                logger.record_tabular('Time', time.time() - start_time)
                logger.record_tabular('ItrTime', time.time() - itr_start_time)

                #import pickle
                #with open('paths_itr'+str(itr)+'.pkl', 'wb') as f:
                #    pickle.dump(paths, f)

                # debugging
                """
                if itr % 1 == 0:
                    logger.log("Saving visualization of paths")
                    import matplotlib.pyplot as plt;
                    for ind in range(5):
                        plt.clf(); plt.hold(True)
                        points = paths[ind]['observations']
                        plt.plot(points[:,0], points[:,1], '-r', linewidth=2)
                        plt.xlim([-1.0, 1.0])
                        plt.ylim([-1.0, 1.0])
                        plt.legend(['path'])
                        plt.savefig('/home/cfinn/path'+str(ind)+'.png')
                """
                # end debugging

                logger.dump_tabular(with_prefix=False)
                if self.plot:
                    self.update_plot()
                    if self.pause_for_plot:
                        input("Plotting evaluation run: Press Enter to "
                              "continue...")
        self.shutdown_worker()
def train(self):
    # TODO - make this a util
    flatten_list = lambda l: [item for sublist in l for item in sublist]

    with tf.Session() as sess:
        # Code for loading a previous policy. Somewhat hacky because needs to be in sess.
        if self.load_policy is not None:
            import joblib
            self.policy = joblib.load(self.load_policy)['policy']

        self.init_opt()
        # initialize uninitialized vars (only initialize vars that were not loaded)
        uninit_vars = []
        for var in tf.global_variables():
            # note - this is hacky, may be better way to do this in newer TF.
            try:
                sess.run(var)
            except tf.errors.FailedPreconditionError:
                uninit_vars.append(var)
        sess.run(tf.variables_initializer(uninit_vars))

        self.start_worker()
        start_time = time.time()
        for itr in range(self.start_itr, self.n_itr):
            itr_start_time = time.time()
            with logger.prefix('itr #%d | ' % itr):
                logger.log("Sampling set of tasks/goals for this meta-batch...")

                # sample environment configuration
                env = self.env
                while not ('sample_env_params' in dir(env) or 'sample_goals' in dir(env)):
                    env = env._wrapped_env
                if 'sample_goals' in dir(env):
                    learner_env_params = env.sample_goals(self.meta_batch_size)
                elif 'sample_env_params' in dir(env):
                    learner_env_params = env.sample_env_params(self.meta_batch_size)

                self.policy.switch_to_init_dist()  # Switch to pre-update policy

                all_samples_data, all_paths = [], []
                for step in range(self.num_grad_updates + 1):
                    #if step > 0:
                    #    import pdb; pdb.set_trace()  # test param_vals functions.
                    logger.log('** Step ' + str(step) + ' **')
                    logger.log("Obtaining samples...")
                    paths = self.obtain_samples(itr, reset_args=learner_env_params, log_prefix=str(step))
                    all_paths.append(paths)
                    logger.log("Processing samples...")
                    samples_data = {}
                    for key in paths.keys():  # the keys are the tasks
                        # don't log because this will spam the console with every task.
                        samples_data[key] = self.process_samples(itr, paths[key], log=False)
                    all_samples_data.append(samples_data)
                    # for logging purposes
                    self.process_samples(itr, flatten_list(paths.values()), prefix=str(step), log=True)
                    logger.log("Logging diagnostics...")
                    self.log_diagnostics(flatten_list(paths.values()), prefix=str(step))
                    if step < self.num_grad_updates:
                        logger.log("Computing policy updates...")
                        self.policy.compute_updated_dists(samples_data)

                logger.log("Optimizing policy...")
                # This needs to take all samples_data so that it can construct graph for meta-optimization.
                self.optimize_policy(itr, all_samples_data)
                logger.log("Saving snapshot...")
                params = self.get_itr_snapshot(itr, all_samples_data[-1])  # , **kwargs)
                if self.store_paths:
                    params["paths"] = all_samples_data[-1]["paths"]
                logger.save_itr_params(itr, params)
                logger.log("Saved")
                logger.record_tabular('Time', time.time() - start_time)
                logger.record_tabular('ItrTime', time.time() - itr_start_time)
                logger.dump_tabular(with_prefix=False)

                # The rest is some example plotting code.
                # Plotting code is useful for visualizing trajectories across a few different tasks.
                if False and itr % 2 == 0 and self.env.observation_space.shape[0] <= 4:  # point-mass
                    logger.log("Saving visualization of paths")
                    for ind in range(min(5, self.meta_batch_size)):
                        plt.clf()
                        plt.plot(learner_env_params[ind][0], learner_env_params[ind][1], 'k*', markersize=10)
                        plt.hold(True)

                        preupdate_paths = all_paths[0]
                        postupdate_paths = all_paths[-1]

                        pre_points = preupdate_paths[ind][0]['observations']
                        post_points = postupdate_paths[ind][0]['observations']
                        plt.plot(pre_points[:, 0], pre_points[:, 1], '-r', linewidth=2)
                        plt.plot(post_points[:, 0], post_points[:, 1], '-b', linewidth=1)

                        pre_points = preupdate_paths[ind][1]['observations']
                        post_points = postupdate_paths[ind][1]['observations']
                        plt.plot(pre_points[:, 0], pre_points[:, 1], '--r', linewidth=2)
                        plt.plot(post_points[:, 0], post_points[:, 1], '--b', linewidth=1)

                        pre_points = preupdate_paths[ind][2]['observations']
                        post_points = postupdate_paths[ind][2]['observations']
                        plt.plot(pre_points[:, 0], pre_points[:, 1], '-.r', linewidth=2)
                        plt.plot(post_points[:, 0], post_points[:, 1], '-.b', linewidth=1)

                        plt.plot(0, 0, 'k.', markersize=5)
                        plt.xlim([-0.8, 0.8])
                        plt.ylim([-0.8, 0.8])
                        plt.legend(['goal', 'preupdate path', 'postupdate path'])
                        plt.savefig(osp.join(logger.get_snapshot_dir(), 'prepost_path' + str(ind) + '.png'))
                elif False and itr % 2 == 0:  # swimmer or cheetah
                    logger.log("Saving visualization of paths")
                    for ind in range(min(5, self.meta_batch_size)):
                        plt.clf()
                        goal_vel = learner_env_params[ind]
                        plt.title('Swimmer paths, goal vel=' + str(goal_vel))
                        plt.hold(True)
                        prepathobs = all_paths[0][ind][0]['observations']
                        postpathobs = all_paths[-1][ind][0]['observations']
                        plt.plot(prepathobs[:, 0], prepathobs[:, 1], '-r', linewidth=2)
                        plt.plot(postpathobs[:, 0], postpathobs[:, 1], '--b', linewidth=1)
                        plt.plot(prepathobs[-1, 0], prepathobs[-1, 1], 'r*', markersize=10)
                        plt.plot(postpathobs[-1, 0], postpathobs[-1, 1], 'b*', markersize=10)
                        plt.xlim([-1.0, 5.0])
                        plt.ylim([-1.0, 1.0])
                        plt.legend(['preupdate path', 'postupdate path'], loc=2)
                        plt.savefig(osp.join(logger.get_snapshot_dir(), 'swim1d_prepost_itr' + str(itr) + '_id' + str(ind) + '.pdf'))
        self.shutdown_worker()
def _worker_set_seed(_, seed):
    logger.log("Setting seed to %d" % seed)
    ext.set_seed(seed)
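# A hedged sketch of how this worker helper might be dispatched to every
# parallel worker, mirroring the singleton_pool.run_each pattern used in
# populate_task above; the wrapper name and the per-worker seed offset are
# assumptions, not part of the original code.
def set_seed(seed):
    singleton_pool.run_each(
        _worker_set_seed,
        [(seed + i,) for i in range(singleton_pool.n_parallel)])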