Example 1
 def exposed_extend_replay(self, get_action, n_paths, no_plan=False):
     """Extend the replay buffer using the given policy (represented as a
     function from flattened observation vectors to action numbers).
     Optional argument `no_plan` can be used to disable planning, in
     which case this will just return success rates for rollouts without
     actually saving anything to internal replay buffer."""
     n_paths = to_local(n_paths)
     no_plan = to_local(no_plan)
     return self.internal_extend_replay(
         get_action, n_paths, no_plan=no_plan)
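
Every example in this listing passes values that arrive over an RPyC connection through to_local before using them. The helper itself is not shown anywhere in the listing; below is a minimal sketch of what it plausibly does, assuming it simply materialises RPyC netrefs as ordinary local objects via rpyc.utils.classic.obtain:

    from rpyc.utils.classic import obtain

    def to_local(remote_value):
        """Hypothetical sketch: copy a (possibly remote) RPyC value into a
        plain local Python object so that later code pays no per-attribute
        network round-trips."""
        return obtain(remote_value)
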
Example 2
    def _extend_replays(self, num_per_problem):
        """Extend the replays for //all// problems asynchronously."""
        # fire off extension methods
        results = []
        for problem in tqdm.tqdm(self.problems, desc='spawn extend'):
            get_action = self._make_get_action(problem, stochastic=True)
            extend_replay = rpyc.async_(problem.problem_service.extend_replay)
            result = extend_replay(
                get_action,
                num_per_problem,
                no_plan=bool(self.use_saved_training_set))
            # per the RPyC docs, we must keep a strong reference to the
            # async_ proxy (it is cached weakly). We also need a background
            # thread to serve each environment's callbacks into get_action
            # (...this may break things slightly).
            bg_thread = rpyc.utils.helpers.BgServingThread(
                problem.problem_server.conn)
            results.append((problem, extend_replay, result, bg_thread))

        # Now we wait for results to come back. This is horribly inefficient
        # when some environments are much harder than others; oh well.
        succ_rates = []
        for problem, _, result, bg_thread in tqdm.tqdm(
                results, desc='wait extend'):
            succ_rates.append((problem, to_local(result.value)))
            # always shut down cleanly
            bg_thread.stop()

        return succ_rates
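
The RPyC pattern used above (wrap the remote method with rpyc.async_, hold on to the returned async proxy and its AsyncResult, and run a BgServingThread so the peer can call back into get_action while we wait) is easy to get subtly wrong. A stripped-down sketch of the same moving parts against a hypothetical service whose exposed_slow_double method accepts a callback; the host, port and method name are invented for the example:

    import rpyc
    from rpyc.utils.helpers import BgServingThread

    conn = rpyc.connect("localhost", 18861)
    bg_thread = BgServingThread(conn)          # serve the peer's callback requests
    slow_double = rpyc.async_(conn.root.slow_double)
    result = slow_double(lambda x: 2 * x, 21)  # returns an AsyncResult immediately
    result.wait()                              # block until the remote call finishes
    print(result.value)                        # -> 42
    bg_thread.stop()                           # always shut down cleanly
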
Example 3
 def inner(obs):
     obs = to_local(obs)
     # make sure it's 1D (need different strategy for batch
     # cache)
     assert obs.ndim == 1
     obs_bytes = obs.tobytes()
     if obs_bytes not in cache:
         # sess.run() calls are all thread-safe
         act_dist = self.sess.run(
             policy.act_dist, feed_dict={policy.input_ph: obs[None]})[0]
         act_dist = to_local(act_dist)
         if not stochastic:
             # deterministic mode: always take the highest-probability action
             chosen = int(np.argmax(act_dist))
         else:
             # stochastic mode: sample an action from the (renormalised)
             # policy distribution
             act_dist = act_dist / np.sum(act_dist)
             chosen = np.random.choice(
                 np.arange(len(act_dist)), p=act_dist)
         # this cache update is actually thread-safe too thanks to
         # Python's GIL
         cache[obs_bytes] = chosen
     return cache[obs_bytes]
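
The closure above is the get_action callable that gets shipped to each problem service; conceptually it is a memoised policy lookup keyed on the raw bytes of the observation vector. A self-contained illustration of the same trick with a dummy policy (every name below is invented for the example):

    import numpy as np

    _cache = {}

    def dummy_policy(obs):
        # stand-in for sess.run(policy.act_dist, ...)
        return np.array([0.1, 0.7, 0.2])

    def get_action(obs, stochastic=True):
        key = np.asarray(obs).tobytes()  # 1-D array -> hashable cache key
        if key not in _cache:
            act_dist = dummy_policy(obs)
            if not stochastic:
                chosen = int(np.argmax(act_dist))  # greedy
            else:
                act_dist = act_dist / act_dist.sum()
                chosen = int(np.random.choice(len(act_dist), p=act_dist))
            _cache[key] = chosen
        return _cache[key]

    print(get_action(np.zeros(3)))

Note that in stochastic mode the sampled action is still cached, so repeat visits to the same observation reuse the first sample rather than resampling.
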
Example 4
 def __init__(self,
              problems,
              weight_manager,
              summary_writer,
              strategy,
              *,
              batch_size=64,
              lr=0.001,
              lr_steps=[],
              opt_batches_per_epoch=300,
              l1_reg_coeff,
              l2_reg_coeff,
              l1_l2_reg_coeff,
              target_rollouts_per_epoch,
              save_training_set=None,
              use_saved_training_set=None,
              hide_progress=False):
     # gets incremented to deal with TF
     self.batches_seen = 0
     self.problems = problems
     self.weight_manager = weight_manager
     # may be None if no summaries should be written
     self.summary_writer = summary_writer
     self.batch_size_per_problem = max(batch_size // len(problems), 1)
     self.opt_batches_per_epoch = opt_batches_per_epoch
     self.hide_progress = hide_progress
     self.strategy = strategy
     self.max_len = max(
         to_local(problem.problem_service.get_max_len())
         for problem in self.problems)
     self.tf_init_done = False
     self.lr = lr
     self.l1_reg_coeff = l1_reg_coeff
     self.l2_reg_coeff = l2_reg_coeff
     self.l1_l2_reg_coeff = l1_l2_reg_coeff
     self.target_rollouts_per_epoch = target_rollouts_per_epoch
     self.timer = TimerContext()
     self.save_training_set = save_training_set
     self.use_saved_training_set = use_saved_training_set
     if use_saved_training_set:
         print("Loading saved training set from '%s'" %
               use_saved_training_set)
         self.loaded_training_set = joblib.load(use_saved_training_set)
     lr_steps = [(0, lr)] + sorted(lr_steps)
     for k, lr in lr_steps:
         assert k >= 0, "one of the steps was negative (?)"
         assert isinstance(k, int), \
             "one of the LR step epoch nums (%s) was not an int" % (k, )
         assert lr > 0, \
             "one of the given learning rates was not positive (?)"
     self.lr_steps = lr_steps
     self.lr_steps_remaining = list(lr_steps)
     self._init_tf()
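
lr_steps is validated above as a list of (epoch, learning_rate) pairs with (0, lr) prepended, and a copy is kept in self.lr_steps_remaining. How that schedule is consumed is not shown in this snippet; the following is a hedged sketch of one plausible helper (_current_lr is a made-up name) that pops entries once their epoch has been reached:

    def _current_lr(self, epoch_num):
        # hypothetical: walk the (epoch, lr) schedule in order, keeping the
        # learning rate of the most recent step whose epoch has passed
        while self.lr_steps_remaining \
                and self.lr_steps_remaining[0][0] <= epoch_num:
            _, self.lr = self.lr_steps_remaining.pop(0)
        return self.lr
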
Example 5
    def _make_batches(self, n_batches):
        """Make a given number of batches for each problem."""
        batch_iters = []

        if self.save_training_set:
            to_save = {}

        for problem in self.problems:
            service = problem.problem_service

            if self.use_saved_training_set:
                assert not self.save_training_set, \
                    "saving training set & using a saved set are mutually " \
                    "exclusive options (doesn't make sense to write same " \
                    "dataset back out to disk!)"
                prob_obs_tensor, prob_qv_tensor, prob_counts \
                    = self.loaded_training_set[problem.name]
                it = weighted_batch_iter(
                    (prob_obs_tensor, prob_qv_tensor),
                    prob_counts,
                    self.batch_size_per_problem,
                    n_batches,
                )
                batch_iters.append(it)
                continue

            if service.dataset_is_empty():
                print("\nNo data for problem '%s' yet (teacher time-out?)" %
                      service.get_current_problem_name())
                batch_iters.append(repeat(None))
                if self.save_training_set:
                    to_save[problem.name] = None
            else:
                prob_obs_tensor, prob_qv_tensor, prob_counts \
                    = to_local(service.weighted_dataset())
                it = weighted_batch_iter(
                    (prob_obs_tensor, prob_qv_tensor),
                    prob_counts,
                    self.batch_size_per_problem,
                    n_batches,
                )
                batch_iters.append(it)
                if self.save_training_set:
                    to_save[problem.name] \
                        = (prob_obs_tensor, prob_qv_tensor, prob_counts)

        if self.save_training_set:
            print("\nSaving training set to '%s'" % self.save_training_set)
            dirname = os.path.dirname(self.save_training_set)
            if dirname:
                os.makedirs(dirname, exist_ok=True)
            joblib.dump(to_save, self.save_training_set)

        combined = zip(*batch_iters)

        # yield a complete feed dict
        for combined_batch in combined:
            assert len(combined_batch) == len(self.problems)
            yield_val = {}
            have_batch = False
            for problem, batch in zip(self.problems, combined_batch):
                ph_obs_var, ph_q_values = self.obs_qv_inputs[problem.name]
                if batch is None:
                    obs_tensor = empty_feed_value(ph_obs_var)
                    qv_tensor = empty_feed_value(ph_q_values)
                else:
                    obs_tensor, qv_tensor = batch
                    have_batch = True
                yield_val[ph_obs_var] = obs_tensor
                yield_val[ph_q_values] = qv_tensor
            assert have_batch, \
                "don't have any batches at all for training problems"
            yield yield_val
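
weighted_batch_iter is not defined in this listing. A plausible, explicitly hypothetical sketch, assuming it yields n_batches minibatches whose rows are sampled with probability proportional to counts:

    import numpy as np

    def weighted_batch_iter(arrays, counts, batch_size, n_batches):
        """Hypothetical sketch: sample row indices in proportion to counts
        and yield the corresponding slices of every array in arrays."""
        probs = np.asarray(counts, dtype='float64')
        probs = probs / probs.sum()
        for _ in range(n_batches):
            idx = np.random.choice(len(probs), size=batch_size, p=probs)
            yield tuple(arr[idx] for arr in arrays)
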
Example 6
    def _set_up_losses(self, single_prob_instance):
        # create two placeholders
        problem_service = single_prob_instance.problem_service
        policy = single_prob_instance.policy
        ph_obs_var = policy.input_ph
        act_dist = policy.act_dist
        act_dim = act_dist.get_shape().as_list()[1]
        ph_q_values = tf.placeholder(
            shape=[None, act_dim], name='q_values', dtype='float32')

        loss_parts = []

        # now the loss ops
        with tf.name_scope('loss'):
            if self.strategy == SupervisedObjective.ANY_GOOD_ACTION \
               or self.strategy == SupervisedObjective.THERE_CAN_ONLY_BE_ONE:
                best_qv = tf.reduce_min(ph_q_values, axis=-1, keepdims=True)
                # TODO: is 0.01 threshold too big? Hmm.
                act_labels = tf.cast(
                    tf.less(tf.abs(ph_q_values - best_qv), 0.01), 'float32')
                label_sum = tf.reduce_sum(act_labels, axis=-1, keepdims=True)
                act_label_dist = act_labels / tf.math.maximum(label_sum, 1.0)
                # zero out disabled or dead-end actions (those whose Q-value
                # is at least the SSiPP dead-end value)
                dead_end_value = to_local(
                    problem_service.get_ssipp_dead_end_value())
                act_label_dist *= tf.cast(
                    ph_q_values < dead_end_value, 'float32')
                # this tf.cond() call ensures that this still works when batch
                # size is 0 (in which case it returns a loss of 0)
                xent = tf.cond(tf.size(act_label_dist) > 0,
                               true_fn=lambda: tf.reduce_mean(
                                   cross_entropy(act_dist, act_label_dist),
                                   name='xent_reduce'),
                               false_fn=lambda: tf.constant(
                                   0.0, dtype=tf.float32, name='xent_ph'),
                               name='xent_cond')
                loss_parts.append(('xent', xent))
            elif self.strategy == SupervisedObjective.MAX_ADVANTAGE:
                state_values = tf.reduce_min(ph_q_values, axis=-1)
                exp_q = act_dist * ph_q_values
                exp_vs = tf.reduce_sum(exp_q, axis=-1)
                # state value is irrelevant to objective, but is included
                # because it ensures that zero loss = optimal policy
                q_loss = tf.reduce_mean(exp_vs - state_values)
                loss_parts.append(('qloss', q_loss))
            else:
                raise ValueError("Unknown strategy %s" % self.strategy)

            # regularisation---we need this because the
            # logistic-regression-like optimisation problem we're solving
            # generally has no minimum point otherwise
            weights = self.weight_manager.all_weights
            weights_no_bias = [w for w in weights if len(w.shape) > 1]
            weights_all_bias = [w for w in weights if len(w.shape) <= 1]
            # downweight regulariser penalty on biases (for most DL work
            # they're un-penalised, but here I think it pays to have *some*
            # penalty given that there are some problems that we can solve
            # perfectly)
            bias_coeff = 0.05
            if self.l2_reg_coeff:

                def do_l2_reg(lst):
                    return sum(map(tf.nn.l2_loss, lst))

                l2_reg = self.l2_reg_coeff * do_l2_reg(weights_no_bias) \
                    + bias_coeff * self.l2_reg_coeff \
                    * do_l2_reg(weights_all_bias)
                loss_parts.append(('l2reg', l2_reg))

            if self.l1_reg_coeff:

                def do_l1_reg(lst):
                    return sum(tf.linalg.norm(w, ord=1) for w in lst)

                l1_reg = self.l1_reg_coeff * do_l1_reg(weights_no_bias) \
                    + bias_coeff * self.l1_reg_coeff \
                    * do_l1_reg(weights_all_bias)
                loss_parts.append(('l1reg', l1_reg))

            if self.l1_l2_reg_coeff:
                all_weights_ap = []
                # act_weights[:-1] omits the last layer (which we don't want to
                # apply group sparsity penalty to)
                all_weights_ap.extend(self.weight_manager.act_weights[:-1])
                all_weights_ap.extend(self.weight_manager.prop_weights)
                l1_l2_reg_accum = 0.0
                for weight_dict in all_weights_ap:
                    for trans_mat, bias in weight_dict.values():
                        bias_size, = bias.shape.as_list()
                        tm_shape = trans_mat.shape.as_list()
                        # tm_shape[0] is always 1, tm_shape[1] is size of
                        # input, and tm_shape[2] is network channel count
                        assert len(tm_shape) == 3 and tm_shape[0] == 1 \
                            and tm_shape[2] == bias_size, "tm_shape %s does " \
                            "not match bias size %s" % (tm_shape, bias_size)
                        trans_square = tf.reduce_sum(
                            tf.square(trans_mat), axis=[0, 1])
                        bias_square = tf.square(bias)
                        norms = tf.sqrt(trans_square + bias_square)
                        l1_l2_reg_accum += tf.reduce_sum(norms)
                l1_l2_reg = self.l1_l2_reg_coeff * l1_l2_reg_accum
                loss_parts.append(('l1l2reg', l1_l2_reg))

            with tf.name_scope('combine_parts'):
                loss = sum(p[1] for p in loss_parts)

        return ph_obs_var, ph_q_values, loss, loss_parts
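
The cross_entropy helper used for the xent term is also not shown. A minimal TF1-style sketch, assuming act_dist already holds probabilities (e.g. a softmax output) and adding a small epsilon for numerical stability:

    import tensorflow as tf

    def cross_entropy(act_dist, act_label_dist, eps=1e-8):
        # per-example cross-entropy between the target and predicted
        # action distributions
        return -tf.reduce_sum(
            act_label_dist * tf.math.log(act_dist + eps), axis=-1)
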
Example 7
 def _get_replay_sizes(self):
     """Get the sizes of replay buffers for each problem."""
     rv = []
     for problem in self.problems:
         rv.append(to_local(problem.problem_service.get_replay_size()))
     return rv
Example 8
 def exposed_env_step(self, action_num):
     action_num = to_local(action_num)
     next_cstate, step_cost \
         = sample_next_state(self.current_state, action_num, self.p)
     self.current_state = next_cstate
     return self.current_state, step_cost
Example 9
 def exposed_action_name(self, action_num):
     action_num = to_local(action_num)
     return get_action_name(self.p, action_num)
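
From the client side, the two exposed methods above are enough for a basic rollout loop. A rough sketch, where the rollout helper and its arguments are invented for illustration: conn is an RPyC connection whose root exposes env_step and action_name, get_action maps an observation of the current state to an action number, and to_local is as sketched after Example 1:

    def rollout(conn, get_action, initial_state, max_steps=100):
        """Hypothetical client-side rollout against the environment service."""
        state, total_cost = initial_state, 0.0
        for _ in range(max_steps):
            action_num = get_action(state)
            print("taking", to_local(conn.root.action_name(action_num)))
            state, step_cost = to_local(conn.root.env_step(action_num))
            total_cost += step_cost
        return total_cost
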