def __init__(self, name, problem_server):
    self.name = name
    # need a handle to the problem server so that it doesn't get GC'd (which
    # would kill the child process!)
    self.problem_server = problem_server
    self.problem_service = problem_server.service
    self.prob_meta, self.dom_meta = to_local(
        self.problem_service.get_meta())
    self.env_spec = to_local(self.problem_service.get_env_spec())
    self.dg_extra_dim = to_local(self.problem_service.get_dg_extra_dim())
    # will get filled in later
    self.policy = None
def exposed_batch_iter(self, batch_size, n_batches):
    """Sample <batch_size> elements from internal buffer."""
    batch_size = to_local(batch_size)
    n_batches = to_local(n_batches)
    # first convert replay buffer to a list so that we can shuffle and
    # take indices
    assert len(self.replay) > 0, 'need non-empty replay pool'
    ordered_buf = list(self.replay)
    shuffle(ordered_buf)  # in-place
    gen = cycle(ordered_buf)
    for batch_num in range(n_batches):
        rich_batch = list(islice(gen, batch_size))
        yield self.flatten_batch(rich_batch)
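# A minimal client-side sketch of how the iterator above might be consumed;
# over RPyC the `exposed_` prefix is dropped, so callers see `batch_iter`.
# `problem_service` and `consume_batch` are placeholders for the caller's own
# objects, not names defined in this module.
def _pull_batches_sketch(problem_service, consume_batch):
    for flat_batch in problem_service.batch_iter(batch_size=64, n_batches=10):
        # copy the netref'd batch into this process before using it
        consume_batch(to_local(flat_batch))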
def _extend_replays(self, num_per_problem: int):
    """Extend the replays for //all// problems asynchronously."""
    # fire off extension methods
    results = []
    for problem in tqdm.tqdm(self.problems, desc='spawn extend'):
        get_action = self._make_get_action(problem)
        extend_replay = rpyc.async_(problem.problem_service.extend_replay)
        result = extend_replay(get_action, num_per_problem)
        # apparently I need to keep hold of the async ref according to the
        # RPyC docs (it's weak or something). Also, I need a background
        # thread to serve each environment's requests (...this may break
        # things slightly).
        bg_thread = rpyc.utils.helpers.BgServingThread(
            problem.problem_server.conn)
        results.append((extend_replay, result, bg_thread))
    # Now we wait for results to come back. This is horribly inefficient
    # when some environments are much harder than others; oh well.
    succ_rates = []
    for _, result, bg_thread in tqdm.tqdm(results, desc='wait extend'):
        succ_rates.append(to_local(result.value))
        # always shut down cleanly
        bg_thread.stop()
    return succ_rates
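# A minimal sketch of the RPyC async pattern used above, assuming an open
# connection `conn` whose root service exposes `extend_replay`; this helper
# is illustrative and not part of the trainer.
def _async_extend_sketch(conn, get_action, n_paths):
    # wrapping the remote method makes the call return an AsyncResult
    # immediately instead of blocking
    extend_replay = rpyc.async_(conn.root.extend_replay)
    result = extend_replay(get_action, n_paths)
    # a background serving thread lets the remote side call back into our
    # get_action while we wait for the result
    bg_thread = rpyc.utils.helpers.BgServingThread(conn)
    try:
        return result.value  # blocks until the remote call completes
    finally:
        bg_thread.stop()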
def __init__(self,
             problems: List['fpg.SingleProblem'],
             weight_manager: PropNetworkWeights,
             summary_writer: Any,
             strategy: SupervisedObjective,
             kl_coeff: Union[None, float],
             batch_size: int = 64,
             lr: float = 0.001) -> None:
    # this needs to be acquired before figuring out an action from the NN
    self._get_act_lock = threading.RLock()
    # gets incremented to deal with TF
    self.batches_seen = 0
    self.problems = problems
    self.weight_manager = weight_manager
    self.summary_writer = summary_writer
    self.batch_size = batch_size
    self.batch_size_per_problem = max(batch_size // len(problems), 1)
    self.strategy = strategy
    self.kl_coeff = kl_coeff
    self.max_len = max(
        to_local(problem.problem_service.get_max_len())
        for problem in self.problems)
    self.tf_init_done = False
    self.lr = lr
    self._init_tf()
def get_problem_names(pddl_files):
    """Return a list of problem names from some PDDL files by spooling up a
    background process."""
    config = ProblemServiceConfig(pddl_files, None)
    server = ProblemServer(config)
    try:
        names = to_local(server.service.get_problem_names())
        assert isinstance(names, list)
        assert all(isinstance(name, str) for name in names)
    finally:
        server.stop()
    return names
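# Hypothetical usage of the helper above; the PDDL file names are
# illustrative only, not files that ship with this code.
#   names = get_problem_names(['domain.pddl', 'problems.pddl'])
#   print('\n'.join(names))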
def eval_single(args, policy, problem_server, unique_prefix, elapsed_time,
                iter_num, weight_manager, scratch_dir):
    # now we evaluate the learned policy
    print('Evaluating policy')
    trial_results, paths = run_trials(
        policy,
        problem_server,
        args.rounds_eval,
        limit=args.limit_turns,
        det_sample=args.det_eval,
        show_time=args.show_eval_time)
    print('Trial results:')
    print('\n'.join('%s: %s' % (k, v) for k, v in trial_results.items()))
    out_dict = {
        'no_train': args.no_train,
        'args_problems': args.problems,
        'problem': to_local(
            problem_server.service.get_current_problem_name()),
        'timeout': args.timeout,
        'optimiser': args.optimiser.optimiser_name,
        'model': args.model,
        'model_opts': args.model_opts,
        'all_args': sys.argv[1:],
        # TODO: possibly add this. Not sure whether it's worthwhile given
        # that the supposed "convergence" measure might be spurious (e.g.
        # what if it just spikes up in reward briefly?).
        # convergence_* refers to the first iteration at which the best
        # score was encountered
        # 'convergence_time': convergence_time,
        # 'convergence_iters': convergence_iter,
        # elapsed_* also includes time/iterations spent looking for better
        # results after converging
        'elapsed_opt_time': elapsed_time,
        'elapsed_opt_iters': iter_num,
        'trial_paths': paths,
    }
    out_dict.update(trial_results)
    result_path = path.join(scratch_dir, 'results.json')
    with open(result_path, 'w') as fp:
        dump(out_dict, fp, indent=2)
    # also write out lists of actions taken during the final trial
    # TODO: should also write out some randomly chosen paths during training
    # TODO: also write out probabilities of each action for at least some
    # paths (or some states), and maybe even real Q-values of actions (would
    # be helpful!)
    actions_path = path.join(scratch_dir, 'trial-paths.txt')
    with open(actions_path, 'w') as fp:
        for alist in paths:
            fp.write(' -> '.join(alist))
            fp.write('\n\n')
def _make_batches(self, n_batches: int) -> Iterable[Dict[Any, Any]]:
    """Make a given number of batches for each problem."""
    batch_iters = []
    for problem in self.problems:
        service = problem.problem_service
        it = service.batch_iter(self.batch_size_per_problem, n_batches)
        batch_iters.append(it)
    combined = zip(*batch_iters)
    # yield a complete feed dict
    for combined_batch in combined:
        assert len(combined_batch) == len(self.problems)
        yield_val = {}
        for problem, batch in zip(self.problems, combined_batch):
            ph_obs_var, ph_q_values = self.obs_qv_inputs[problem.name]
            obs_tensor, qv_tensor = to_local(batch)
            yield_val[ph_obs_var] = obs_tensor
            yield_val[ph_q_values] = qv_tensor
        yield yield_val
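# A hypothetical training-loop sketch showing how the feed dicts yielded by
# _make_batches might be consumed; `train_op` and `loss_op` stand in for TF
# ops that are not defined in this snippet.
def _run_epoch_sketch(self, train_op, loss_op, n_batches):
    losses = []
    for feed_dict in self._make_batches(n_batches):
        # one session run covers every problem, since each feed dict fills
        # the (observation, Q-value) placeholders for all of them
        _, loss = self.sess.run([train_op, loss_op], feed_dict=feed_dict)
        losses.append(loss)
    return losses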
def inner(obs):
    obs = to_local(obs)
    try:
        # if this times out then something really screwy is going on
        self._get_act_lock.acquire(timeout=60 * 30)
        # each thread needs to have this call somewhere, per
        # https://www.tensorflow.org/versions/r0.12/api_docs/python/client/session_management
        with self.sess.as_default():
            # make sure it's 1D (need different strategy for batch cache)
            assert obs.ndim == 1
            obs_bytes = obs.tostring()
            if obs_bytes not in cache:
                cache[obs_bytes] = get_action(obs)
            return cache[obs_bytes]
    finally:
        self._get_act_lock.release()
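# A standalone sketch of the byte-keyed caching idea above, assuming
# `get_action` is a deterministic function of a 1-D numpy observation; it
# omits the lock and TF session handling for clarity.
def _cached_get_action_sketch(get_action):
    cache = {}

    def inner(obs):
        key = obs.tobytes()  # hashable key for the observation contents
        if key not in cache:
            cache[key] = get_action(obs)
        return cache[key]

    return inner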
def _instantiate_net(self, single_prob_instance: 'fpg.SingleProblem'):
    # create two placeholders
    problem_service = single_prob_instance.problem_service
    policy = single_prob_instance.policy
    obs_dim = to_local(problem_service.get_obs_dim())
    obs_dtype_name = to_local(problem_service.get_obs_dtype_name())
    ph_obs_var = tf.placeholder(
        shape=[None, obs_dim], name='observation', dtype=obs_dtype_name)
    act_dist = policy.dist_info_sym(
        ph_obs_var, summary_collections=['sl-activations'])['prob']
    act_dim = to_local(problem_service.get_act_dim())
    ph_q_values = tf.placeholder(
        shape=[None, act_dim], name='q_values', dtype='float32')
    loss_parts = []
    # now the loss ops
    if self.strategy == SupervisedObjective.ANY_GOOD_ACTION:
        best_qv = tf.reduce_min(ph_q_values, axis=-1, keep_dims=True)
        # TODO: is 0.01 threshold too big? Hmm.
        act_labels = tf.cast(
            tf.less(tf.abs(ph_q_values - best_qv), 0.01), 'float32')
        # act_labels = tf.cast(tf.equal(ph_q_values, best_qv), 'float32')
        label_sum = tf.reduce_sum(act_labels, axis=-1, keep_dims=True)
        act_label_dist = act_labels / label_sum
        # zero out disabled or dead-end actions!
        dead_end_value = to_local(
            problem_service.get_ssipp_dead_end_value())
        act_label_dist *= tf.cast(act_labels <= dead_end_value, 'float32')
        # XXX: this will obviously break if we have softmax; it'll spend
        # heaps of time trying to get all labels to be equal, and still
        # have (nonsense) nonzero loss afterwards :(
        xent = tf.reduce_mean(cross_entropy(act_dist, act_label_dist))
        loss_parts.append(('xent', xent))
    elif self.strategy == SupervisedObjective.MAX_ADVANTAGE:
        state_values = tf.reduce_min(ph_q_values, axis=-1)
        # is_nonzero = tf.greater(act_dist, 1e-4)
        # act_dist_nz = tf.where(is_nonzero, act_dist,
        #                        tf.ones_like(act_dist))
        # exp_q = act_dist_nz * (ph_q_values - state_values)
        exp_q = act_dist * ph_q_values
        exp_vs = tf.reduce_sum(exp_q, axis=-1)
        # state value is irrelevant to the objective, but is included
        # because it ensures that zero loss = optimal policy
        q_loss = tf.reduce_mean(exp_vs - state_values)
        loss_parts.append(('qloss', q_loss))
        # XXX: need to look at whatever this is (and fix it if it's wrong)
        # if self.kl_coeff:
        #     assert self.kl_coeff > 0, \
        #         "negative entropy coefficient must be positive if supplied"
        #     is_nonzero = tf.equal(act_dist, 0.0)
        #     num_enabled = tf.reduce_sum(
        #         tf.cast(is_nonzero, tf.float32), axis=1)
        #     # clip so that really tiny values don't make our loss balloon!
        #     act_dist_clip = tf.clip_by_value(act_dist, 1e-10, 1.0)
        #     # also change all the zero values to ones, so that they count
        #     # as zero in the summation below
        #     act_dist_clamp = tf.where(is_nonzero, act_dist_clip,
        #                               tf.ones_like(act_dist))
        #     xent = -tf.reduce_sum(
        #         tf.log(act_dist_clamp), axis=1) / num_enabled
        #     kl_div = -tf.log(num_enabled) + xent
        #     scale_kl_div = self.kl_coeff * tf.reduce_mean(kl_div)
        #     loss_parts.append(('scale-kld', scale_kl_div))
        #
        #     batch_neg_entropy = tf.reduce_sum(
        #         act_dist * tf.log(act_dist_clamp), axis=-1)
        #     # we allow drift of this many bits from uniform; otherwise,
        #     # apply entropy loss!
        #     num_enabled = tf.reduce_sum(
        #         tf.cast(act_dist > 1e-10, tf.float32), axis=1)
        #     allowed_bits = num_enabled - 1.5
        #     uniform_bits = tf.log(num_enabled) / tf.log(2.0)
        #     min_neg_entropy = -uniform_bits + allowed_bits
        #     batch_neg_ent_clip = tf.clip_by_value(batch_neg_entropy,
        #                                           min_neg_entropy, 0)
        #     batch_neg_ent_clip += min_neg_entropy
        #     # we want to maximise entropy, kinda
        #     ent_reg = self.neg_ent_coeff * tf.reduce_mean(
        #         batch_neg_ent_clip)
        #     loss_parts.append(('entreg', ent_reg))
    else:
        raise ValueError("Unknown strategy %s" % self.strategy)
    # regularisation
    # TODO: make this configurable!
    weights = self.weight_manager.all_weights
    l2_reg = 0.001 * sum(tf.nn.l2_loss(w) for w in weights)
    loss_parts.append(('l2reg', l2_reg))
    loss = sum(p[1] for p in loss_parts)
    return ph_obs_var, ph_q_values, loss, loss_parts
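# A tiny numpy illustration (not part of the TF graph) of the label
# distribution built by the ANY_GOOD_ACTION branch: every action whose
# Q-value is within 0.01 of the best (minimum) Q-value gets equal
# probability mass. The numbers are made up for the example.
def _any_good_action_labels_example():
    import numpy as np
    q_values = np.array([[3.0, 1.0, 1.005, 7.0]])
    best_qv = q_values.min(axis=-1, keepdims=True)
    act_labels = (np.abs(q_values - best_qv) < 0.01).astype('float32')
    act_label_dist = act_labels / act_labels.sum(axis=-1, keepdims=True)
    return act_label_dist  # == [[0.0, 0.5, 0.5, 0.0]]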
def _get_replay_sizes(self) -> List[int]:
    """Get the sizes of replay buffers for each problem."""
    rv = []
    for problem in self.problems:
        rv.append(to_local(problem.problem_service.get_replay_size()))
    return rv
def exposed_env_step(self, action):
    action = to_local(action)
    return self.env_wrapped.step(action)
def exposed_action_name(self, action_num):
    action_num = to_local(action_num)
    return self.env_raw.action_name(action_num)
def exposed_extend_replay(self, get_action, n_paths):
    """Extend the replay buffer using the given policy (represented as a
    function from flattened observation vectors to action numbers)."""
    n_paths = to_local(n_paths)
    return self.internal_extend_replay(get_action, n_paths)
def reset(self):
    remote_obs = self._problem_service.env_reset()
    return to_local(remote_obs)
def __init__(self, problem_server):
    self._first_step = True
    self._problem_server = problem_server
    self._problem_service = problem_server.service
    spec = to_local(self._problem_service.get_env_spec())
    self._spec = spec