Example #1
    def __init__(self,
                 var_list,
                 *,
                 decay=0.9,
                 momentum=0.0,
                 epsilon=1e-08,
                 centered=False,
                 scale_grad_by_procs=True,
                 comm=None):
        self.var_list = var_list
        self.decay = decay
        self.momentum = momentum
        # self.beta1 = beta1
        # self.beta2 = beta2
        self.epsilon = epsilon
        self.centered = centered
        self.scale_grad_by_procs = scale_grad_by_procs
        size = sum(U.numel(v) for v in var_list)

        self.mean_square = np.zeros(size, 'float32')
        self.mom = np.zeros(size, 'float32')
        if centered:
            self.mean_grad = np.zeros(size, 'float32')
        self.t = 0
        self.setfromflat = U.SetFromFlat(var_list)
        self.getflat = U.GetFlat(var_list)
        self.comm = MPI.COMM_WORLD if comm is None else comm
Example #2
 def __init__(self, var_list, *, scale_grad_by_procs=True, comm=None):
     self.var_list = var_list
     self.scale_grad_by_procs = scale_grad_by_procs
     self.t = 0
     self.setfromflat = U.SetFromFlat(var_list)
     self.getflat = U.GetFlat(var_list)
     self.comm = MPI.COMM_WORLD if comm is None else comm
Example #3
    def run(self):
        """Override Process.run()"""

        self.env.seed(os.getpid())
        set_global_seeds(os.getpid())
        self.policy.name = str(os.getpid()) + "policy"
        self.policy_tf, _ = self.policy(self.state_tf)

        # Start TF session
        with U.single_threaded_session() as sess:
            pi = make_pi(self.policy_tf, sess, self.state_tf, self.n_actions)
            set_parameters = U.SetFromFlat(self.policy.trainable_vars,
                                           dtype=self.dtype)
            # Build the sampling logic fn
            sampling_fn = make_sampling_fn(pi, self.env, self.nb_steps,
                                           self.n_actions)

            # Start sampling-worker loop.
            while True:
                # self.event.wait()  # Wait for a new message
                # self.event.clear()  # Upon message receipt, mark as read
                message, policy_ws, theta = self.inputQ.get()  # Pop message
                self.env.set_params(theta)
                if message == "sample":
                    # Set weights
                    set_parameters(policy_ws)
                    # Do sampling
                    stats = sampling_fn()
                    self.outputQ.put((os.getpid(), stats))

                elif message == "exit":
                    print("[Worker {}] Exiting...".format(os.getpid()))
                    self.env.close()
                    break
Example #4
    def run(self):

        sess = U.single_threaded_session()
        sess.__enter__()

        env = self.make_env(self.index)
        self.env = env
        env.reset()
        workerseed = self.seed + 10000 * MPI.COMM_WORLD.Get_rank()
        set_all_seeds(workerseed)
        env.seed(workerseed)
        scope = 'pi%s' % os.getpid()
        pi = self.make_pi(scope, env)
        var = get_pi_trainable_variables(scope)
        set_from_flat = U.SetFromFlat(var)

        print('Worker %s - Running with seed %s' % (os.getpid(), workerseed))

        while True:
            self.event.wait()
            self.event.clear()
            command, weights = self.input.get()
            if command == 'collect':
                set_from_flat(weights)
                #print('Worker %s - Collecting...' % os.getpid())
                samples = self.traj_segment_generator(pi, env)
                self.output.put((os.getpid(), samples))
            elif command == 'exit':
                print('Worker %s - Exiting...' % os.getpid())
                env.close()
                sess.close()
                break
Example #5
File: cnn_policy.py    Project: Umbi14/pois
    def _prepare_getsetters(self):
        with tf.variable_scope('pol') as vs:
            self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, \
                                         scope=vs.name)

        self.get_parameter = U.GetFlat(self.var_list)
        self.set_parameter = U.SetFromFlat(self.var_list)
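The pair created above is the core pattern on this page: U.GetFlat reads every variable in var_list into one flat float32 vector, and U.SetFromFlat writes such a vector back, splitting it by variable shape. A minimal sketch of that round trip, assuming an active TensorFlow session and the get_parameter/set_parameter handles from this example (the perturb_and_restore helper and its noise_std argument are illustrative, not part of the project):

import numpy as np

def perturb_and_restore(policy, noise_std=0.01):
    theta = policy.get_parameter()                  # flat vector of all 'pol' parameters
    policy.set_parameter(theta + noise_std * np.random.randn(*theta.shape))
    ...                                             # e.g. evaluate the perturbed policy here
    policy.set_parameter(theta)                     # restore the original weights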
Example #6
 def __init__(self, var_list, beta1=0.9, beta2=0.999, epsilon=1e-08):
     self.var_list = var_list
     self.beta1 = beta1
     self.beta2 = beta2
     self.epsilon = epsilon
     size = sum(U.numel(v) for v in var_list)
     self.m = np.zeros(size, 'float32')
     self.v = np.zeros(size, 'float32')
     self.t = 0
     self.setfromflat = U.SetFromFlat(var_list)
     self.getflat = U.GetFlat(var_list)
Example #7
 def __init__(self, value):
     assert MPI, "MPI is not loaded"
     self.value = value
     self.comm = MPI.COMM_WORLD
     self.status = MPI.Status()
     self.rank = self.comm.Get_rank()
     var_list = value.get_variables_for_broadcast()
     self.setfromflat = tf_util.SetFromFlat(
         var_list)  # , dtype=var_list[0].dtype)
     self.getflat = tf_util.GetFlat(var_list)
     self.count_updates = 1
     self.shared_weights = None
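The class above only wires up getflat/setfromflat and an MPI communicator. A hedged sketch of how such a helper could push the rank-0 weights to every worker follows; the broadcast method name is an assumption, not taken from the original project:

 def broadcast(self):
     # Rank 0 serializes its variables into a flat vector; the other ranks receive it.
     theta = self.getflat() if self.rank == 0 else None
     theta = self.comm.bcast(theta, root=0)  # mpi4py pickle-based broadcast
     if self.rank != 0:
         self.setfromflat(theta)             # load the received vector into the TF variables
     self.count_updates += 1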
Example #8
 def __init__(self, var_list, *, beta1=0.9, beta2=0.999, epsilon=1e-08, scale_grad_by_procs=True, comm=None):
     self.var_list = var_list
     self.beta1 = beta1
     self.beta2 = beta2
     self.epsilon = epsilon
     self.scale_grad_by_procs = scale_grad_by_procs
     size = sum(U.numel(v) for v in var_list)
     self.m = np.zeros(size, 'float32')
     self.v = np.zeros(size, 'float32')
     self.t = 0
     self.setfromflat = U.SetFromFlat(var_list)
     self.getflat = U.GetFlat(var_list)
     self.comm = MPI.COMM_WORLD if comm is None else comm
Example #9
    def __init__(self, name, reuse=False, *args, **kwargs):
        with tf.variable_scope(name):
            if reuse:
                tf.get_variable_scope().reuse_variables()
            self._init(*args, **kwargs)
            self.scope = tf.get_variable_scope().name

        #print(self.scope)
        #set up for functions to get and set policy parameters:
        var = self.get_trainable_variables()
        var_list = [v for v in var]
        self.flatten_var = tf.concat(
            axis=0, values=[tf.reshape(v, [U.numel(v)]) for v in var_list])

        self.get_flat = U.GetFlat(var_list)
        self.set_from_flat = U.SetFromFlat(var_list)

        self.setupadam(self.l2_reg)
Example #10
 def __init__(self, var_list, **_3to2kwargs):
     if 'comm' in _3to2kwargs:
         comm = _3to2kwargs['comm']
         del _3to2kwargs['comm']
     else:
         comm = None
     if 'scale_grad_by_procs' in _3to2kwargs:
         scale_grad_by_procs = _3to2kwargs['scale_grad_by_procs']
         del _3to2kwargs['scale_grad_by_procs']
     else:
         scale_grad_by_procs = True
     if 'epsilon' in _3to2kwargs:
         epsilon = _3to2kwargs['epsilon']
         del _3to2kwargs['epsilon']
     else:
         epsilon = 1e-08
     if 'beta2' in _3to2kwargs:
         beta2 = _3to2kwargs['beta2']
         del _3to2kwargs['beta2']
     else:
         beta2 = 0.999
     if 'beta1' in _3to2kwargs:
         beta1 = _3to2kwargs['beta1']
         del _3to2kwargs['beta1']
     else:
         beta1 = 0.9
     self.var_list = var_list
     self.beta1 = beta1
     self.beta2 = beta2
     self.epsilon = epsilon
     self.scale_grad_by_procs = scale_grad_by_procs
     size = sum(U.numel(v) for v in var_list)
     self.m = np.zeros(size, u'float32')
     self.v = np.zeros(size, u'float32')
     self.t = 0
     self.setfromflat = U.SetFromFlat(var_list)
     self.getflat = U.GetFlat(var_list)
     self.comm = MPI.COMM_WORLD if comm is None else comm
Example #11
    def __init__(self,
                 var_list,
                 *,
                 beta1=0.9,
                 beta2=0.999,
                 epsilon=1e-08,
                 scale_grad_by_procs=True,
                 comm=None,
                 sess=None):
        """
        A parallel MPI implementation of the Adam optimizer for TensorFlow
        https://arxiv.org/abs/1412.6980

        :param var_list: ([TensorFlow Tensor]) the variables
        :param beta1: (float) Adam beta1 parameter
        :param beta2: (float) Adam beta2 parameter
        :param epsilon: (float) to help with preventing arithmetic issues
        :param scale_grad_by_procs: (bool) if the scaling should be done by processes
        :param comm: (MPI Communicators) if None, MPI.COMM_WORLD
        :param sess: (TensorFlow Session) if None, tf.get_default_session()
        """
        self.var_list = var_list
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.scale_grad_by_procs = scale_grad_by_procs
        size = sum(tf_utils.numel(v) for v in var_list)
        # Exponential moving average of gradient values
        # "first moment estimate" m in the paper
        self.exp_avg = np.zeros(size, 'float32')
        # Exponential moving average of squared gradient values
        # "second raw moment estimate" v in the paper
        self.exp_avg_sq = np.zeros(size, 'float32')
        self.step = 0
        self.setfromflat = tf_utils.SetFromFlat(var_list, sess=sess)
        self.getflat = tf_utils.GetFlat(var_list, sess=sess)
        self.comm = MPI.COMM_WORLD if comm is None else comm
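The docstring above describes a parallel MPI Adam optimizer, but __init__ only allocates the moment buffers. A sketch of the matching update step is shown below; it follows the standard Adam rules plus an MPI allreduce of the flat local gradient, and is an illustration of the idea rather than the verbatim library implementation:

    def update(self, local_grad, step_size):
        """Average the flat gradient across MPI workers, then apply one Adam step."""
        global_grad = np.zeros_like(local_grad)
        self.comm.Allreduce(local_grad, global_grad, op=MPI.SUM)
        if self.scale_grad_by_procs:
            global_grad /= self.comm.Get_size()

        self.step += 1
        # Bias-corrected step size, as in Kingma & Ba (2014)
        corrected = step_size * np.sqrt(1 - self.beta2 ** self.step) / (1 - self.beta1 ** self.step)
        self.exp_avg = self.beta1 * self.exp_avg + (1 - self.beta1) * global_grad
        self.exp_avg_sq = self.beta2 * self.exp_avg_sq + (1 - self.beta2) * np.square(global_grad)
        delta = -corrected * self.exp_avg / (np.sqrt(self.exp_avg_sq) + self.epsilon)
        self.setfromflat(self.getflat() + delta)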
Example #12
    def __init__(self, policy_params):

        self.ob_dim = policy_params['ob_dim']
        self.ac_dim = policy_params['ac_dim']
        self.policy = MlpPolicy('pi',
                                policy_params['ob_space'],
                                policy_params['ac_space'],
                                hid_size=64,
                                num_hid_layers=2)
        all_var_list = self.policy.get_trainable_variables()
        var_list = [
            v for v in all_var_list if v.name.split("/")[1].startswith("pol")
        ]
        self.get_flat = U.GetFlat(var_list)
        self.set_from_flat = U.SetFromFlat(var_list)
        sess = tf.get_default_session()
        U.initialize()
        #print('sess',sess)
        #sess.run(tf.global_variables_initializer())
        self.weights = self.get_flat()
        # a filter for updating statistics of the observations and normalizing inputs to the policies
        self.observation_filter = get_filter(policy_params['ob_filter'],
                                             shape=(self.ob_dim, ))
        self.update_filter = True
Example #13
def learn(
        env,
        make_policy,
        *,
        n_episodes,
        horizon,
        delta,
        gamma,
        max_iters,
        sampler=None,
        use_natural_gradient=False,  #can be 'exact', 'approximate'
        fisher_reg=1e-2,
        iw_method='is',
        iw_norm='none',
        bound='J',
        line_search_type='parabola',
        save_weights=0,
        improvement_tol=0.,
        center_return=False,
        render_after=None,
        max_offline_iters=100,
        callback=None,
        clipping=False,
        entropy='none',
        positive_return=False,
        reward_clustering='none',
        capacity=10,
        warm_start=True):

    np.set_printoptions(precision=3)
    max_samples = horizon * n_episodes

    if line_search_type == 'binary':
        line_search = line_search_binary
    elif line_search_type == 'parabola':
        line_search = line_search_parabola
    else:
        raise ValueError()

    # Building the environment
    ob_space = env.observation_space
    ac_space = env.action_space

    # Creating the memory buffer
    memory = Memory(capacity=capacity,
                    batch_size=n_episodes,
                    horizon=horizon,
                    ob_space=ob_space,
                    ac_space=ac_space)

    # Building the target policy and saving its parameters
    pi = make_policy('pi', ob_space, ac_space)
    all_var_list = pi.get_trainable_variables()
    var_list = [
        v for v in all_var_list if v.name.split('/')[1].startswith('pol')
    ]
    shapes = [U.intprod(var.get_shape().as_list()) for var in var_list]
    n_parameters = sum(shapes)

    # Building a set of behavioral policies
    behavioral_policies = memory.build_policies(make_policy, pi)

    # Placeholders
    ob_ = ob = U.get_placeholder_cached(name='ob')
    ac_ = pi.pdtype.sample_placeholder([None], name='ac')
    mask_ = tf.placeholder(dtype=tf.float32, shape=(None), name='mask')
    rew_ = tf.placeholder(dtype=tf.float32, shape=(None), name='rew')
    disc_rew_ = tf.placeholder(dtype=tf.float32, shape=(None), name='disc_rew')
    clustered_rew_ = tf.placeholder(dtype=tf.float32, shape=(None))
    gradient_ = tf.placeholder(dtype=tf.float32,
                               shape=(n_parameters, 1),
                               name='gradient')
    iter_number_ = tf.placeholder(dtype=tf.int32, name='iter_number')
    active_policies = tf.placeholder(dtype=tf.float32,
                                     shape=(capacity),
                                     name='active_policies')
    losses_with_name = []

    # Total number of trajectories
    N_total = tf.reduce_sum(active_policies) * n_episodes

    # Split operations
    disc_rew_split = tf.reshape(disc_rew_ * mask_, [-1, horizon])
    rew_split = tf.reshape(rew_ * mask_, [-1, horizon])
    mask_split = tf.reshape(mask_, [-1, horizon])

    # Policy densities
    target_log_pdf = pi.pd.logp(ac_) * mask_
    target_log_pdf_split = tf.reshape(target_log_pdf, [-1, horizon])
    behavioral_log_pdfs = tf.stack([
        bpi.pd.logp(ac_) * mask_ for bpi in memory.policies
    ])  # Shape is (capacity, ntraj*horizon)
    behavioral_log_pdfs_split = tf.reshape(behavioral_log_pdfs,
                                           [memory.capacity, -1, horizon])

    # Compute renyi divergencies and sum over time, then exponentiate
    emp_d2_split = tf.reshape(
        tf.stack([pi.pd.renyi(bpi.pd, 2) * mask_ for bpi in memory.policies]),
        [memory.capacity, -1, horizon])
    emp_d2_split_cum = tf.exp(tf.reduce_sum(emp_d2_split, axis=2))
    # Compute arithmetic and harmonic mean of emp_d2
    emp_d2_mean = tf.reduce_mean(emp_d2_split_cum, axis=1)
    emp_d2_arithmetic = tf.reduce_sum(
        emp_d2_mean * active_policies) / tf.reduce_sum(active_policies)
    emp_d2_harmonic = tf.reduce_sum(active_policies) / tf.reduce_sum(
        1 / emp_d2_mean)

    # Return processing: clipping, centering, discounting
    ep_return = clustered_rew_  #tf.reduce_sum(mask_split * disc_rew_split, axis=1)
    if clipping:
        rew_split = tf.clip_by_value(rew_split, -1, 1)
    if center_return:
        ep_return = ep_return - tf.reduce_mean(ep_return)
        rew_split = rew_split - (tf.reduce_sum(rew_split) /
                                 (tf.reduce_sum(mask_split) + 1e-24))
    discounter = [pow(gamma, i) for i in range(0, horizon)]  # Decreasing gamma
    discounter_tf = tf.constant(discounter)
    disc_rew_split = rew_split * discounter_tf

    # Reward statistics
    return_mean = tf.reduce_mean(ep_return)
    return_std = U.reduce_std(ep_return)
    return_max = tf.reduce_max(ep_return)
    return_min = tf.reduce_min(ep_return)
    return_abs_max = tf.reduce_max(tf.abs(ep_return))
    return_step_max = tf.reduce_max(tf.abs(rew_split))  # Max step reward
    return_step_mean = tf.abs(tf.reduce_mean(rew_split))
    positive_step_return_max = tf.maximum(0.0, tf.reduce_max(rew_split))
    negative_step_return_max = tf.maximum(0.0, tf.reduce_max(-rew_split))
    return_step_maxmin = tf.abs(positive_step_return_max -
                                negative_step_return_max)
    losses_with_name.extend([(return_mean, 'InitialReturnMean'),
                             (return_max, 'InitialReturnMax'),
                             (return_min, 'InitialReturnMin'),
                             (return_std, 'InitialReturnStd'),
                             (emp_d2_arithmetic, 'EmpiricalD2Arithmetic'),
                             (emp_d2_harmonic, 'EmpiricalD2Harmonic'),
                             (return_step_max, 'ReturnStepMax'),
                             (return_step_maxmin, 'ReturnStepMaxmin')])

    if iw_method == 'is':
        # Sum the log prob over time. Shapes: target(Nep, H), behav (Cap, Nep, H)
        target_log_pdf_episode = tf.reduce_sum(target_log_pdf_split, axis=1)
        behavioral_log_pdf_episode = tf.reduce_sum(behavioral_log_pdfs_split,
                                                   axis=2)
        # To avoid numerical instability, compute the inverse ratio
        log_ratio = target_log_pdf_split - behavioral_log_pdfs_split
        inverse_log_ratio_episode = -tf.reduce_sum(log_ratio, axis=2)

        iw = 1 / tf.reduce_sum(tf.exp(inverse_log_ratio_episode) *
                               tf.expand_dims(active_policies, -1),
                               axis=0)

        # Compute also the balance-heuristic weights
        iw_split = tf.reshape(iw, (memory.capacity, -1))
        iw_by_behavioral = tf.reduce_mean(iw_split, axis=1)
        losses_with_name.append(
            (iw_by_behavioral[0] / tf.reduce_sum(iw_by_behavioral),
             'MultiIWFirstRatio'))
        losses_with_name.append(
            (tf.reduce_max(iw_by_behavioral), 'MultiIWMax'))
        losses_with_name.append(
            (tf.reduce_sum(iw_by_behavioral), 'MultiIWSum'))
        losses_with_name.append(
            (tf.reduce_min(iw_by_behavioral), 'MultiIWMin'))

        # Get the probability by exponentiation
        #target_pdf_episode = tf.exp(target_log_pdf_episode)
        #behavioral_pdf_episode = tf.exp(behavioral_log_pdf_episode)
        # Get the denominator by averaging over behavioral policies
        #behavioral_pdf_mixture = tf.reduce_mean(behavioral_pdf_episode, axis=0) + 1e-24
        #iw = target_pdf_episode / behavioral_pdf_mixture
        iwn = iw / n_episodes

        # Compute the J
        w_return_mean = tf.reduce_sum(ep_return * iwn)
        # Empirical D2 of the mixture and relative ESS
        ess_renyi_arithmetic = N_total / emp_d2_arithmetic
        ess_renyi_harmonic = N_total / emp_d2_harmonic
        # Log quantities
        losses_with_name.extend([
            (tf.reduce_max(iw), 'MaxIW'), (tf.reduce_min(iw), 'MinIW'),
            (tf.reduce_mean(iw), 'MeanIW'), (U.reduce_std(iw), 'StdIW'),
            (tf.reduce_min(target_log_pdf_episode), 'MinTargetPdf'),
            (tf.reduce_min(behavioral_log_pdf_episode), 'MinBehavPdf'),
            (ess_renyi_arithmetic, 'ESSRenyiArithmetic'),
            (ess_renyi_harmonic, 'ESSRenyiHarmonic')
        ])
    else:
        raise NotImplementedError()

    if bound == 'J':
        bound_ = w_return_mean
    elif bound == 'max-d2-harmonic':
        bound_ = w_return_mean - tf.sqrt(
            (1 - delta) / (delta * ess_renyi_harmonic)) * return_abs_max
    elif bound == 'max-d2-arithmetic':
        bound_ = w_return_mean - tf.sqrt(
            (1 - delta) / (delta * ess_renyi_arithmetic)) * return_abs_max
    else:
        raise NotImplementedError()

    # Policy entropy for exploration
    ent = pi.pd.entropy()
    meanent = tf.reduce_mean(ent)
    losses_with_name.append((meanent, 'MeanEntropy'))
    # Add policy entropy bonus
    if entropy != 'none':
        scheme, v1, v2 = entropy.split(':')
        if scheme == 'step':
            entcoeff = tf.cond(iter_number_ < int(v2), lambda: float(v1),
                               lambda: float(0.0))
            losses_with_name.append((entcoeff, 'EntropyCoefficient'))
            entbonus = entcoeff * meanent
            bound_ = bound_ + entbonus
        elif scheme == 'lin':
            ip = tf.cast(iter_number_ / max_iters, tf.float32)
            entcoeff_decay = tf.maximum(
                0.0,
                float(v2) + (float(v1) - float(v2)) * (1.0 - ip))
            losses_with_name.append((entcoeff_decay, 'EntropyCoefficient'))
            entbonus = entcoeff_decay * meanent
            bound_ = bound_ + entbonus
        elif scheme == 'exp':
            ent_f = tf.exp(
                -tf.abs(tf.reduce_mean(iw) - 1) * float(v2)) * float(v1)
            losses_with_name.append((ent_f, 'EntropyCoefficient'))
            bound_ = bound_ + ent_f * meanent
        else:
            raise Exception('Unrecognized entropy scheme.')

    losses_with_name.append((w_return_mean, 'ReturnMeanIW'))
    losses_with_name.append((bound_, 'Bound'))
    losses, loss_names = map(list, zip(*losses_with_name))
    '''
    if use_natural_gradient:
        p = tf.placeholder(dtype=tf.float32, shape=[None])
        target_logpdf_episode = tf.reduce_sum(target_log_pdf_split * mask_split, axis=1)
        grad_logprob = U.flatgrad(tf.stop_gradient(iwn) * target_logpdf_episode, var_list)
        dot_product = tf.reduce_sum(grad_logprob * p)
        hess_logprob = U.flatgrad(dot_product, var_list)
        compute_linear_operator = U.function([p, ob_, ac_, disc_rew_, mask_], [-hess_logprob])
    '''

    assert_ops = tf.group(*tf.get_collection('asserts'))
    print_ops = tf.group(*tf.get_collection('prints'))

    compute_lossandgrad = U.function([
        ob_, ac_, rew_, disc_rew_, clustered_rew_, mask_, iter_number_,
        active_policies
    ], losses + [U.flatgrad(bound_, var_list), assert_ops, print_ops])
    compute_grad = U.function([
        ob_, ac_, rew_, disc_rew_, clustered_rew_, mask_, iter_number_,
        active_policies
    ], [U.flatgrad(bound_, var_list), assert_ops, print_ops])
    compute_bound = U.function([
        ob_, ac_, rew_, disc_rew_, clustered_rew_, mask_, iter_number_,
        active_policies
    ], [bound_, assert_ops, print_ops])
    compute_losses = U.function([
        ob_, ac_, rew_, disc_rew_, clustered_rew_, mask_, iter_number_,
        active_policies
    ], losses)
    #compute_temp = U.function([ob_, ac_, rew_, disc_rew_, clustered_rew_, mask_, iter_number_, active_policies], [log_inverse_ratio, abc, iw])

    set_parameter = U.SetFromFlat(var_list)
    get_parameter = U.GetFlat(var_list)
    policy_reinit = tf.variables_initializer(var_list)

    if sampler is None:
        seg_gen = traj_segment_generator(pi,
                                         env,
                                         n_episodes,
                                         horizon,
                                         stochastic=True,
                                         gamma=gamma)
        sampler = type("SequentialSampler", (object, ), {
            "collect": lambda self, _: seg_gen.__next__()
        })()

    U.initialize()

    # Starting optimizing
    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=n_episodes)
    rewbuffer = deque(maxlen=n_episodes)

    while True:

        iters_so_far += 1

        if render_after is not None and iters_so_far % render_after == 0:
            if hasattr(env, 'render'):
                render(env, pi, horizon)

        if callback:
            callback(locals(), globals())

        if iters_so_far >= max_iters:
            print('Finished...')
            break

        logger.log('********** Iteration %i ************' % iters_so_far)

        theta = get_parameter()

        with timed('sampling'):
            seg = sampler.collect(theta)

        lens, rets = seg['ep_lens'], seg['ep_rets']
        lenbuffer.extend(lens)
        rewbuffer.extend(rets)
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)

        # Adding batch of trajectories to memory
        memory.add_trajectory_batch(seg)

        # Get multiple batches from memory
        seg_with_memory = memory.get_trajectories()

        # Get clustered reward
        reward_matrix = np.reshape(
            seg_with_memory['disc_rew'] * seg_with_memory['mask'],
            (-1, horizon))
        ep_reward = np.sum(reward_matrix, axis=1)
        ep_reward = cluster_rewards(ep_reward, reward_clustering)

        args = ob, ac, rew, disc_rew, clustered_rew, mask, iter_number, active_policies = (
            seg_with_memory['ob'], seg_with_memory['ac'],
            seg_with_memory['rew'], seg_with_memory['disc_rew'], ep_reward,
            seg_with_memory['mask'], iters_so_far,
            memory.get_active_policies_mask())

        def evaluate_loss():
            loss = compute_bound(*args)
            return loss[0]

        def evaluate_gradient():
            gradient = compute_grad(*args)
            return gradient[0]

        if use_natural_gradient:

            def evaluate_fisher_vector_prod(x):
                return compute_linear_operator(x, *args)[0] + fisher_reg * x

            def evaluate_natural_gradient(g):
                return cg(evaluate_fisher_vector_prod,
                          g,
                          cg_iters=10,
                          verbose=0)
        else:
            evaluate_natural_gradient = None

        with timed('summaries before'):
            logger.record_tabular("Iteration", iters_so_far)
            logger.record_tabular("InitialBound", evaluate_loss())
            logger.record_tabular("EpLenMean", np.mean(lenbuffer))
            logger.record_tabular("EpRewMean", np.mean(rewbuffer))
            logger.record_tabular("EpThisIter", len(lens))
            logger.record_tabular("EpisodesSoFar", episodes_so_far)
            logger.record_tabular("TimestepsSoFar", timesteps_so_far)
            logger.record_tabular("TimeElapsed", time.time() - tstart)

        if save_weights > 0 and iters_so_far % save_weights == 0:
            logger.record_tabular('Weights', str(get_parameter()))
            import pickle
            file = open('checkpoint' + str(iters_so_far) + '.pkl', 'wb')
            pickle.dump(theta, file)

        if not warm_start or memory.get_current_load() == capacity:
            # Optimize
            with timed("offline optimization"):
                theta, improvement = optimize_offline(
                    theta,
                    set_parameter,
                    line_search,
                    evaluate_loss,
                    evaluate_gradient,
                    evaluate_natural_gradient,
                    max_offline_ite=max_offline_iters)

            set_parameter(theta)
            print(theta)

            with timed('summaries after'):
                meanlosses = np.array(compute_losses(*args))
                for (lossname, lossval) in zip(loss_names, meanlosses):
                    logger.record_tabular(lossname, lossval)
        else:
            # Reinitialize the policy
            tf.get_default_session().run(policy_reinit)

        logger.dump_tabular()

    env.close()
Example #14
def learn(base_env,
          policy_fn, *,
          max_fitness,  # has to be negative, as CMA-ES considers minimization
          popsize,
          gensize,
          bounds,
          sigma,
          eval_iters,
          timesteps_per_actorbatch,
          max_timesteps = 0,
          max_episodes = 0,
          max_iters = 0,
          max_seconds = 0,
          seed = 0
          ):
    set_global_seeds(seed)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = base_env.observation_space
    ac_space = base_env.action_space
    pi = policy_fn("pi", ob_space, ac_space)  # Construct network for new policy
    backup_pi = policy_fn("backup_pi", ob_space, ac_space)  # Construct a network for every individual to adapt during the es evolution

    U.initialize()
    pi_set_from_flat_params = U.SetFromFlat(pi.get_trainable_variables())
    pi_get_flat_params = U.GetFlat(pi.get_trainable_variables())

    global timesteps_so_far, episodes_so_far, iters_so_far, \
        tstart, lenbuffer, rewbuffer,best_fitness, eval_seq
    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen = 100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen = 100)  # rolling buffer for episode rewards


    assign_backup_eq_new = U.function([], [], updates = [tf.assign(backup_v, newv)
                                                      for (backup_v, newv) in zipsame(
            backup_pi.get_variables(), pi.get_variables())])
    assign_new_eq_backup = U.function([], [], updates = [tf.assign(newv, backup_v)
                                                      for (newv, backup_v) in zipsame(
            pi.get_variables(), backup_pi.get_variables())])


    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0,
                max_seconds > 0]) == 1, "Only one time constraint permitted"

    # Build generator for all solutions
    actors = []
    best_fitness = -np.inf

    eval_seq = traj_segment_generator_eval(pi, base_env,
                                          timesteps_per_actorbatch,
                                          stochastic = True)
    for i in range(popsize):
        newActor = traj_segment_generator(pi, base_env,
                                          timesteps_per_actorbatch,
                                          stochastic = True,
                                          eval_iters = eval_iters)
        actors.append(newActor)

    flatten_weights = pi_get_flat_params()
    opt = cma.CMAOptions()
    opt['tolfun'] = max_fitness
    opt['popsize'] = popsize
    opt['maxiter'] = gensize
    opt['verb_disp'] = 0
    opt['verb_log'] = 0
    # opt['seed'] = seed
    opt['AdaptSigma'] = True
    # opt['bounds'] = bounds
    es = cma.CMAEvolutionStrategy(flatten_weights,
                                  sigma, opt)
    costs = None
    best_solution = None

    while True:
        if max_timesteps and timesteps_so_far >= max_timesteps:
            logger.log("Max time steps")
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            logger.log("Max episodes")
            break
        elif max_iters and iters_so_far >= max_iters:
            logger.log("Max iterations")
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            logger.log("Max time")
            break
        elif es.countiter >= opt['maxiter']:
            logger.log("Max generations")
            break
        assign_backup_eq_new() # backup current policy

        logger.log("********** Generation %i ************" % iters_so_far)
        eval_seg = eval_seq.__next__()
        rewbuffer.extend(eval_seg["ep_rets"])
        lenbuffer.extend(eval_seg["ep_lens"])
        if iters_so_far == 0:
            result_record()

        solutions = es.ask()
        if costs is not None:
            solutions[np.argmax(costs)] = np.copy(best_solution)
        ob_segs = None
        segs = []
        costs = []
        lens = []
        for id, solution in enumerate(solutions):
            # pi.set_Flat_variables(solution)
            pi_set_from_flat_params(solution)
            seg = actors[id].__next__()
            costs.append(-np.mean(seg["ep_rets"]))
            lens.append(np.sum(seg["ep_lens"]))
            segs.append(seg)
            if ob_segs is None:
                ob_segs = {'ob': np.copy(seg['ob'])}
            else:
                ob_segs['ob'] = np.append(ob_segs['ob'], seg['ob'], axis=0)
            assign_new_eq_backup()
        fit_idx = np.array(costs).flatten().argsort()[:len(costs)]
        solutions = np.array(solutions)[fit_idx]
        costs = np.array(costs)[fit_idx]
        segs = np.array(segs)[fit_idx]
        # Weights decay
        # costs, real_costs = fitness_shift(costs)
        # costs, real_costs = compute_centered_ranks(costs)
        l2_decay = compute_weight_decay(0.01, solutions)
        costs += l2_decay
        costs, real_costs = fitness_normalization(costs)
        # best_solution = np.copy(solutions[0])
        # best_fitness = -real_costs[0]
        # rewbuffer.extend(segs[0]["ep_rets"])
        # lenbuffer.extend(segs[0]["ep_lens"])
        es.tell_real_seg(solutions = solutions, function_values = costs, real_f = real_costs, segs = segs)
        best_solution = np.copy(es.result[0])
        best_fitness = -es.result[1]
        rewbuffer.extend(es.result[3]["ep_rets"])
        lenbuffer.extend(es.result[3]["ep_lens"])
        logger.log("Generation:", es.countiter)
        logger.log("Best Solution Fitness:", best_fitness)
        pi_set_from_flat_params(best_solution)

        ob = ob_segs["ob"]
        if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob)  # update running mean/std for observation normalization

        iters_so_far += 1
        episodes_so_far += sum(lens)
Example #15
 def set_Layer_Flat_variables(self, old_var_list, var_list):
     set_from_flat = U.SetFromFlat(old_var_list)
     set_from_flat(var_list)
Example #16
 def set_Flat_variables(self, var_list):
     set_from_flat = U.SetFromFlat(self.get_trainable_variables())
     set_from_flat(var_list)
Example #17
def learn(
        env,
        policy_fn,
        *,
        timesteps_per_actorbatch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        # CMAES
    max_fitness,  # has to be negative, as CMA-ES considers minimization
        popsize,
        gensize,
        bounds,
        sigma,
        eval_iters,
        max_v_train_iter,
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,
        # time constraint
        callback=None,
        # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',
        # annealing for stepsize parameters (epsilon and adam)
        seed,
        env_id):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space,
                   ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy
    backup_pi = policy_fn(
        "backup_pi", ob_space, ac_space
    )  # Construct a network for every individual to adapt during the es evolution

    pi_params = tf.placeholder(dtype=tf.float32, shape=[None])
    old_pi_params = tf.placeholder(dtype=tf.float32, shape=[None])
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule

    layer_clip = tf.placeholder(
        name='layer_clip', dtype=tf.float32,
        shape=[])  # per-layer clipping multiplier

    bound_coeff = tf.placeholder(
        name='bound_coeff', dtype=tf.float32,
        shape=[])  # bound coefficient

    clip_param = clip_param * lrmult * layer_clip  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - (oldpi.pd.logp(ac) + 1e-8))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                             1.0 + clip_param) * atarg  #
    pol_surr = -tf.reduce_mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    vf_losses = [vf_loss]
    vf_loss_names = ["vf_loss"]

    pol_loss = pol_surr + pol_entpen
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    vf_var_list = [
        v for v in var_list if v.name.split("/")[1].startswith("vf")
    ]
    pol_var_list = [
        v for v in var_list if v.name.split("/")[1].startswith("pol")
    ]

    layer_var_list = []
    for i in range(pi.num_hid_layers):
        layer_var_list.append([
            v for v in pol_var_list
            if v.name.split("/")[2].startswith('fc%i' % (i + 1))
        ])
    logstd_var_list = [
        v for v in pol_var_list if v.name.split("/")[2].startswith("logstd")
    ]
    if len(logstd_var_list) != 0:
        layer_var_list.append([
            v for v in pol_var_list if v.name.split("/")[2].startswith("final")
        ] + logstd_var_list)

    vf_lossandgrad = U.function([ob, ac, ret, lrmult],
                                vf_losses + [U.flatgrad(vf_loss, vf_var_list)])

    lossandgrad = U.function([ob, ac, atarg, ret, lrmult, layer_clip],
                             losses + [U.flatgrad(total_loss, var_list)])

    vf_adam = MpiAdam(vf_var_list, epsilon=adam_epsilon)
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    assign_backup_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(backup_v, newv) for (
                backup_v,
                newv) in zipsame(backup_pi.get_variables(), pi.get_variables())
        ])
    assign_new_eq_backup = U.function(
        [], [],
        updates=[
            tf.assign(newv, backup_v)
            for (newv, backup_v
                 ) in zipsame(pi.get_variables(), backup_pi.get_variables())
        ])
    # Compute all losses

    compute_pol_losses = U.function([ob, ac, atarg, ret, lrmult, layer_clip],
                                    [pol_loss, pol_surr, pol_entpen, meankl])

    compute_v_pred = U.function([ob], [pi.vpred])

    a_prob = tf.exp(pi.pd.logp(ac))
    compute_a_prob = U.function([ob, ac], [a_prob])

    U.initialize()

    layer_set_operate_list = []
    layer_get_operate_list = []
    for var in layer_var_list:
        set_pi_layer_flat_params = U.SetFromFlat(var)
        layer_set_operate_list.append(set_pi_layer_flat_params)
        get_pi_layer_flat_params = U.GetFlat(var)
        layer_get_operate_list.append(get_pi_layer_flat_params)

    # get_pi_layer_flat_params = U.GetFlat(pol_var_list)
    # set_pi_layer_flat_params = U.SetFromFlat(pol_var_list)

    vf_adam.sync()

    adam.sync()

    global timesteps_so_far, episodes_so_far, iters_so_far, \
        tstart, lenbuffer, rewbuffer, tstart, ppo_timesteps_so_far, best_fitness

    episodes_so_far = 0
    timesteps_so_far = 0
    ppo_timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    best_fitness = -np.inf

    eval_seq = traj_segment_generator_eval(pi,
                                           env,
                                           timesteps_per_actorbatch,
                                           stochastic=False)
    # eval_gen = traj_segment_generator_eval(pi, test_env, timesteps_per_actorbatch, stochastic = True)  # For evaluation
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_actorbatch,
                                     stochastic=True,
                                     eval_seq=eval_seq)  # For train V Func

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    indices = []  # maintain all selected indices for each iteration

    opt = cma.CMAOptions()
    opt['tolfun'] = max_fitness
    opt['popsize'] = popsize
    opt['maxiter'] = gensize
    opt['verb_disp'] = 0
    opt['verb_log'] = 0
    # opt['seed'] = seed
    opt['AdaptSigma'] = True
    # opt['bounds'] = bounds
    # opt['tolstagnation'] = 20
    ess = []
    seg = None
    segs = None
    sum_vpred = []
    while True:
        if max_timesteps and timesteps_so_far >= max_timesteps:
            print("Max time steps")
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            print("Max episodes")
            break
        elif max_iters and iters_so_far >= max_iters:
            print("Max iterations")
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            print("Max time")
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / (max_timesteps),
                             0)
        else:
            raise NotImplementedError

        # epsilon = max(0.5 - float(timesteps_so_far) / (max_timesteps), 0) * cur_lrmult
        epsilon = max(0.5 * cur_lrmult, 0)
        # epsilon = 0.2
        sigma_adapted = max(sigma * cur_lrmult, 1e-8)
        # sigma_adapted = max(max(sigma - float(timesteps_so_far) / (5000 * max_timesteps), 0) * cur_lrmult, 1e-8)
        # cmean_adapted = max(1.0 - float(timesteps_so_far) / (max_timesteps), 1e-8)
        # cmean_adapted = max(0.8 - float(timesteps_so_far) / (2*max_timesteps), 1e-8)
        # if timesteps_so_far % max_timesteps == 10:
        max_v_train_iter = int(
            max(
                max_v_train_iter * (1 - timesteps_so_far /
                                    (0.5 * max_timesteps)), 1))
        logger.log("********** Iteration %i ************" % iters_so_far)
        if iters_so_far == 0:
            eval_seg = eval_seq.__next__()
            rewbuffer.extend(eval_seg["ep_rets"])
            lenbuffer.extend(eval_seg["ep_lens"])
            result_record()

        # Repository Train
        train_segs = {}
        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)
        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(
                seg["ob"])  # update running mean/std for normalization

        # rewbuffer.extend(seg["ep_rets"])
        # lenbuffer.extend(seg["ep_lens"])
        #
        # if iters_so_far == 0:
        #     result_record()

        assign_old_eq_new()  # set old parameter values to new parameter values
        if segs is None:
            segs = seg
            segs["v_target"] = np.zeros(len(seg["ob"]), 'float32')
        elif len(segs["ob"]) >= 50000:
            segs["ob"] = np.take(segs["ob"],
                                 np.arange(timesteps_per_actorbatch,
                                           len(segs["ob"])),
                                 axis=0)
            segs["next_ob"] = np.take(segs["next_ob"],
                                      np.arange(timesteps_per_actorbatch,
                                                len(segs["next_ob"])),
                                      axis=0)
            segs["ac"] = np.take(segs["ac"],
                                 np.arange(timesteps_per_actorbatch,
                                           len(segs["ac"])),
                                 axis=0)
            segs["rew"] = np.take(segs["rew"],
                                  np.arange(timesteps_per_actorbatch,
                                            len(segs["rew"])),
                                  axis=0)
            segs["vpred"] = np.take(segs["vpred"],
                                    np.arange(timesteps_per_actorbatch,
                                              len(segs["vpred"])),
                                    axis=0)
            segs["act_props"] = np.take(segs["act_props"],
                                        np.arange(timesteps_per_actorbatch,
                                                  len(segs["act_props"])),
                                        axis=0)
            segs["new"] = np.take(segs["new"],
                                  np.arange(timesteps_per_actorbatch,
                                            len(segs["new"])),
                                  axis=0)
            segs["adv"] = np.take(segs["adv"],
                                  np.arange(timesteps_per_actorbatch,
                                            len(segs["adv"])),
                                  axis=0)
            segs["tdlamret"] = np.take(segs["tdlamret"],
                                       np.arange(timesteps_per_actorbatch,
                                                 len(segs["tdlamret"])),
                                       axis=0)
            segs["ep_rets"] = np.take(segs["ep_rets"],
                                      np.arange(timesteps_per_actorbatch,
                                                len(segs["ep_rets"])),
                                      axis=0)
            segs["ep_lens"] = np.take(segs["ep_lens"],
                                      np.arange(timesteps_per_actorbatch,
                                                len(segs["ep_lens"])),
                                      axis=0)
            segs["v_target"] = np.take(segs["v_target"],
                                       np.arange(timesteps_per_actorbatch,
                                                 len(segs["v_target"])),
                                       axis=0)
            segs["ob"] = np.append(segs['ob'], seg['ob'], axis=0)
            segs["next_ob"] = np.append(segs['next_ob'],
                                        seg['next_ob'],
                                        axis=0)
            segs["ac"] = np.append(segs['ac'], seg['ac'], axis=0)
            segs["rew"] = np.append(segs['rew'], seg['rew'], axis=0)
            segs["vpred"] = np.append(segs['vpred'], seg['vpred'], axis=0)
            segs["act_props"] = np.append(segs['act_props'],
                                          seg['act_props'],
                                          axis=0)
            segs["new"] = np.append(segs['new'], seg['new'], axis=0)
            segs["adv"] = np.append(segs['adv'], seg['adv'], axis=0)
            segs["tdlamret"] = np.append(segs['tdlamret'],
                                         seg['tdlamret'],
                                         axis=0)
            segs["ep_rets"] = np.append(segs['ep_rets'],
                                        seg['ep_rets'],
                                        axis=0)
            segs["ep_lens"] = np.append(segs['ep_lens'],
                                        seg['ep_lens'],
                                        axis=0)
            segs["v_target"] = np.append(segs['v_target'],
                                         np.zeros(len(seg["ob"]), 'float32'),
                                         axis=0)
        else:
            segs["ob"] = np.append(segs['ob'], seg['ob'], axis=0)
            segs["next_ob"] = np.append(segs['next_ob'],
                                        seg['next_ob'],
                                        axis=0)
            segs["ac"] = np.append(segs['ac'], seg['ac'], axis=0)
            segs["rew"] = np.append(segs['rew'], seg['rew'], axis=0)
            segs["vpred"] = np.append(segs['vpred'], seg['vpred'], axis=0)
            segs["act_props"] = np.append(segs['act_props'],
                                          seg['act_props'],
                                          axis=0)
            segs["new"] = np.append(segs['new'], seg['new'], axis=0)
            segs["adv"] = np.append(segs['adv'], seg['adv'], axis=0)
            segs["tdlamret"] = np.append(segs['tdlamret'],
                                         seg['tdlamret'],
                                         axis=0)
            segs["ep_rets"] = np.append(segs['ep_rets'],
                                        seg['ep_rets'],
                                        axis=0)
            segs["ep_lens"] = np.append(segs['ep_lens'],
                                        seg['ep_lens'],
                                        axis=0)
            segs["v_target"] = np.append(segs['v_target'],
                                         np.zeros(len(seg["ob"]), 'float32'),
                                         axis=0)

        if iters_so_far == 0:
            ob, ac, tdlamret = seg["ob"], seg["ac"], seg["tdlamret"]
            d = Dataset(dict(ob=ob, ac=ac, vtarg=tdlamret),
                        shuffle=not pi.recurrent)
            optim_batchsize = optim_batchsize or ob.shape[0]

            # Train V function
            # logger.log("Catchup Training V Func and Evaluating V Func Losses")
            for _ in range(max_v_train_iter):
                for batch in d.iterate_once(optim_batchsize):
                    *vf_loss, g = vf_lossandgrad(batch["ob"], batch["ac"],
                                                 batch["vtarg"], cur_lrmult)
                    vf_adam.update(g, optim_stepsize * cur_lrmult)
                # logger.log(fmt_row(13, np.mean(vf_losses, axis = 0)))
        else:

            # Update v target
            new = segs["new"]
            rew = segs["rew"]
            act_prob = np.asarray(compute_a_prob(segs["ob"], segs["ac"])).T
            importance_ratio = np.squeeze(act_prob) / (
                segs["act_props"] + np.ones(segs["act_props"].shape) * 1e-8)
            segs["v_target"] = importance_ratio * (1 / np.sum(importance_ratio)) * \
                               np.squeeze(
                                   rew + np.invert(new).astype(np.float32) * gamma * compute_v_pred(segs["next_ob"]))
            # train_segs["v_target"] = rew + np.invert(new).astype(np.float32) * gamma * compute_v_pred(train_segs["next_ob"])
            if len(segs["ob"]) >= 20000:
                train_times = int(max_v_train_iter /
                                  2) if int(max_v_train_iter / 2) > 0 else 1
            else:
                train_times = 2
            for i in range(train_times):
                selected_train_index = np.random.choice(
                    range(len(segs["ob"])),
                    timesteps_per_actorbatch,
                    replace=False)
                train_segs["ob"] = np.take(segs["ob"],
                                           selected_train_index,
                                           axis=0)
                train_segs["next_ob"] = np.take(segs["next_ob"],
                                                selected_train_index,
                                                axis=0)
                train_segs["ac"] = np.take(segs["ac"],
                                           selected_train_index,
                                           axis=0)
                train_segs["rew"] = np.take(segs["rew"],
                                            selected_train_index,
                                            axis=0)
                train_segs["vpred"] = np.take(segs["vpred"],
                                              selected_train_index,
                                              axis=0)
                train_segs["new"] = np.take(segs["new"],
                                            selected_train_index,
                                            axis=0)
                train_segs["adv"] = np.take(segs["adv"],
                                            selected_train_index,
                                            axis=0)
                train_segs["tdlamret"] = np.take(segs["tdlamret"],
                                                 selected_train_index,
                                                 axis=0)
                train_segs["v_target"] = np.take(segs["v_target"],
                                                 selected_train_index,
                                                 axis=0)
                #
                ob, ac, v_target = train_segs["ob"], train_segs[
                    "ac"], train_segs["v_target"]
                d = Dataset(dict(ob=ob, ac=ac, vtarg=v_target),
                            shuffle=not pi.recurrent)
                optim_batchsize = optim_batchsize or ob.shape[0]

                # Train V function
                # logger.log("Training V Func and Evaluating V Func Losses")
                # Train V function
                # logger.log("Catchup Training V Func and Evaluating V Func Losses")
                # logger.log("Train V - "+str(_))
                for _ in range(max_v_train_iter):
                    for batch in d.iterate_once(optim_batchsize):
                        *vf_loss, g = vf_lossandgrad(batch["ob"], batch["ac"],
                                                     batch["vtarg"],
                                                     cur_lrmult)
                        vf_adam.update(g, optim_stepsize * cur_lrmult)
                    # logger.log(fmt_row(13, np.mean(vf_losses, axis = 0)))
                # seg['vpred'] = np.asarray(compute_v_pred(seg["ob"])).reshape(seg['vpred'].shape)
                # seg['nextvpred'] = seg['vpred'][-1] * (1 - seg["new"][-1])
                # add_vtarg_and_adv(seg, gamma, lam)

            ob, ac, atarg, v_target = seg["ob"], seg["ac"], seg["adv"], seg[
                "tdlamret"]
            atarg = (atarg - atarg.mean()) / atarg.std(
            )  # standardized advantage function estimate
            d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=v_target),
                        shuffle=not pi.recurrent)
            optim_batchsize = optim_batchsize or ob.shape[0]
            # Local search
            for _ in range(optim_epochs):
                for batch in d.iterate_once(optim_batchsize):
                    *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                                batch["atarg"], batch["vtarg"],
                                                cur_lrmult, 1 / 4)
                    adam.update(g, optim_stepsize * cur_lrmult)

            # seg['vpred'] = np.asarray(compute_v_pred(seg["ob"])).reshape(seg['vpred'].shape)
            # seg['nextvpred'] = seg['vpred'][-1] * (1 - seg["new"][-1])
            # add_vtarg_and_adv(seg, gamma, lam)

        ob_po, ac_po, atarg_po, tdlamret_po = seg["ob"], seg["ac"], seg[
            "adv"], seg["tdlamret"]
        atarg_po = (atarg_po - atarg_po.mean()) / atarg_po.std(
        )  # standardized advantage function estimate

        # opt['CMA_cmean'] = cmean_adapted
        # assign_old_eq_new()  # set old parameter values to new parameter values
        for i in range(len(layer_var_list)):
            # CMAES Train Policy
            assign_backup_eq_new()  # backup current policy
            flatten_weights = layer_get_operate_list[i]()

            if len(indices) < len(layer_var_list):
                selected_index, init_weights = uniform_select(
                    flatten_weights,
                    0.5)  # 0.5 means 50% proportion of params are selected
                indices.append(selected_index)
            else:
                rand = np.random.uniform()
                # print("Random-Number:", rand)
                # print("Epsilon:", epsilon)
                if rand < epsilon:
                    selected_index, init_weights = uniform_select(
                        flatten_weights, 0.5)
                    indices.append(selected_index)
                    # logger.log("Random: select new weights")
                else:
                    selected_index = indices[i]
                    init_weights = np.take(flatten_weights, selected_index)
            es = cma.CMAEvolutionStrategy(init_weights, sigma_adapted, opt)
            while True:
                if es.countiter >= gensize:
                    # logger.log("Max generations for current layer")
                    break
                # logger.log("Iteration:" + str(iters_so_far) + " - sub-train Generation for Policy:" + str(es.countiter))
                # logger.log("Sigma=" + str(es.sigma))
                # solutions = es.ask(sigma_fac = max(cur_lrmult, 1e-8))
                solutions = es.ask()
                # solutions = [np.clip(solution, -5.0, 5.0).tolist() for solution in solutions]
                costs = []
                lens = []

                assign_backup_eq_new()  # backup current policy

                for id, solution in enumerate(solutions):
                    np.put(flatten_weights, selected_index, solution)
                    layer_set_operate_list[i](flatten_weights)
                    cost = compute_pol_losses(ob_po, ac_po, atarg_po,
                                              tdlamret_po, cur_lrmult,
                                              1 / 4 * (i + 1))
                    costs.append(cost[0])
                    assign_new_eq_backup()
                # Weights decay
                l2_decay = compute_weight_decay(0.01, solutions)
                costs += l2_decay
                costs, real_costs = fitness_rank(costs)
                # logger.log("real_costs:"+str(real_costs))
                # best_solution = np.copy(es.result[0])
                # best_fitness = -es.result[1]
                es.tell_real_seg(solutions=solutions,
                                 function_values=costs,
                                 real_f=real_costs,
                                 segs=None)
                # best_solution = np.copy(solutions[np.argmin(costs)])
                # best_fitness = -real_costs[np.argmin(costs)]
                best_solution = es.result[0]
                best_fitness = es.result[1]
                np.put(flatten_weights, selected_index, best_solution)
                layer_set_operate_list[i](flatten_weights)
                # logger.log("Update the layer")
                # best_solution = es.result[0]
                # best_fitness = es.result[1]
                # logger.log("Best Solution Fitness:" + str(best_fitness))
                # set_pi_flat_params(best_solution)
            import gc
            gc.collect()

        iters_so_far += 1
        episodes_so_far += sum(lens)
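
The loop above drives the standard cma ask/tell interface plus two project-specific helpers (fitness_rank and es.tell_real_seg). A minimal, self-contained sketch of the same pattern using only the public cma API, with a hypothetical quadratic objective standing in for compute_pol_losses and the same L2 weight-decay idea applied to each candidate:

import numpy as np
import cma  # pip install cma

def toy_cost(w):
    # hypothetical stand-in for compute_pol_losses: a simple quadratic bowl
    return float(np.sum(np.square(w - 0.5)))

def weight_decay_penalty(coef, solutions):
    # same idea as compute_weight_decay above: penalize large parameters
    sols = np.asarray(solutions)
    return coef * np.mean(sols * sols, axis=1)

init_weights = np.zeros(10)  # stands in for the selected slice of the flat parameter vector
es = cma.CMAEvolutionStrategy(init_weights, 0.1, {'popsize': 8, 'verbose': -9})
for _ in range(20):  # plays the role of gensize
    solutions = es.ask()
    costs = np.array([toy_cost(np.asarray(s)) for s in solutions])
    costs = costs + weight_decay_penalty(0.01, solutions)
    es.tell(solutions, costs.tolist())  # tell_real_seg above is a project extension of this call
best = es.result.xbest  # the example writes this back with np.put(flatten_weights, ...)
print('best cost:', es.result.fbest)
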
Example #18
0
def learn(
        *,
        network,
        env,
        total_timesteps,
        timesteps_per_batch=1024,  # what to train on
        max_kl=0.002,
        cg_iters=10,
        gamma=0.99,
        lam=1.0,  # advantage estimation
        seed=None,
        ent_coef=0.00,
        cg_damping=1e-2,
        vf_stepsize=3e-4,
        vf_iters=3,
        max_episodes=0,
        max_iters=0,  # time constraint
        callback=None,
        load_path=None,
        num_reward=1,
        **network_kwargs):
    '''
    Learn a policy function with the TRPO algorithm.

    Parameters:
    ----------

    network                 neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types)
                            or function that takes input placeholder and returns tuple (output, None) for feedforward nets
                            or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets

    env                     environment (one of the gym environments or wrapped via baselines.common.vec_env.VecEnv-type class)

    timesteps_per_batch     timesteps per gradient estimation batch

    max_kl                  max KL divergence between old policy and new policy ( KL(pi_old || pi) )

    ent_coef                coefficient of policy entropy term in the optimization objective

    cg_iters                number of iterations of conjugate gradient algorithm

    cg_damping              conjugate gradient damping

    vf_stepsize             learning rate for the Adam optimizer used to optimize the value function loss

    vf_iters                number of value function optimization iterations per policy optimization step

    total_timesteps           max number of timesteps

    max_episodes            max number of episodes

    max_iters               maximum number of policy optimization iterations

    callback                function to be called with (locals(), globals()) each policy optimization step

    load_path               str, path to load the model from (default: None, i.e. no model is loaded)

    **network_kwargs        keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network

    Returns:
    -------

    learnt model

    '''

    if MPI is not None:
        nworkers = MPI.COMM_WORLD.Get_size()
        rank = MPI.COMM_WORLD.Get_rank()
    else:
        nworkers = 1
        rank = 0

    cpus_per_worker = 1
    U.get_session(
        config=tf.ConfigProto(allow_soft_placement=True,
                              inter_op_parallelism_threads=cpus_per_worker,
                              intra_op_parallelism_threads=cpus_per_worker))

    set_global_seeds(seed)
    # build the policy
    policy = build_policy(env,
                          network,
                          value_network='copy',
                          num_reward=num_reward,
                          **network_kwargs)

    process_dir = logger.get_dir()
    save_dir = process_dir.split(
        'Data')[-2] + 'log/l1/seed' + process_dir[-1] + '/'
    os.makedirs(save_dir, exist_ok=True)
    coe_save = []
    impro_save = []
    grad_save = []
    adj_save = []
    coe = np.ones((num_reward)) / num_reward

    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space

    #################################################################
    # ob, ac, ret and atarg are all placeholders
    # here ret and atarg should be vector-valued (one entry per reward)
    ob = observation_placeholder(ob_space)

    # build pi and oldpi
    with tf.variable_scope("pi"):
        pi = policy(observ_placeholder=ob)
    with tf.variable_scope("oldpi"):
        oldpi = policy(observ_placeholder=ob)

    # each reward gets its own atarg
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32,
                         shape=[None, num_reward])  # Empirical return

    ac = pi.pdtype.sample_placeholder([None])

    # the KL divergence and entropy here do not depend on the reward
    ##################################
    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    # entbonus is the entropy loss term
    entbonus = ent_coef * meanent
    #################################

    ###########################################################
    # vferr is used to update the value network
    vferr = tf.reduce_mean(tf.square(pi.vf - ret))
    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))
    # advantage * pnew / pold
    surrgain = tf.reduce_mean(ratio * atarg)

    # optimgain is used to update the policy network; there should be one per reward
    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    ###########################################################
    dist = meankl

    # define the variables to optimize and the Adam optimizer for the value network
    all_var_list = get_trainable_variables("pi")
    # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
    # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]
    var_list = get_pi_trainable_variables("pi")
    vf_var_list = get_vf_trainable_variables("pi")

    vfadam = MpiAdam(vf_var_list)

    # helper that flattens the variables into a single vector
    get_flat = U.GetFlat(var_list)

    # helper that slices a flat vector and assigns the pieces to the variables in var_list
    set_from_flat = U.SetFromFlat(var_list)
    # gradient of the KL divergence
    klgrads = tf.gradients(dist, var_list)

    ####################################################################
    # flattened tangent vector
    flat_tangent = tf.placeholder(dtype=tf.float32,
                                  shape=[None],
                                  name="flat_tan")

    # split the flattened vector back into per-variable tensors
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    ####################################################################

    ####################################################################
    # sum the products of the KL gradients and the tangent pieces
    gvp = tf.add_n([
        tf.reduce_sum(g * tangent)
        for (g, tangent) in zipsame(klgrads, tangents)
    ])  #pylint: disable=E1111
    # flatten the gradient of gvp into a vector
    fvp = U.flatgrad(gvp, var_list)
    ####################################################################

    # update the old policy with the newly learned policy
    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(get_variables("oldpi"), get_variables("pi"))
        ])

    # compute the losses
    compute_losses = U.function([ob, ac, atarg], losses)
    # compute the losses and the gradient
    compute_lossandgrad = U.function([ob, ac, atarg], losses +
                                     [U.flatgrad(optimgain, var_list)])
    # compute the Fisher-vector product
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    # compute the gradient of the value network loss
    compute_vflossandgrad = U.function([ob, ret],
                                       U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        if MPI is not None:
            out = np.empty_like(x)
            MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
            out /= nworkers
        else:
            out = np.copy(x)

        return out

    # initialize the variables
    U.initialize()
    if load_path is not None:
        pi.load(load_path)

    # get the initial parameter vector
    th_init = get_flat()
    if MPI is not None:
        MPI.COMM_WORLD.Bcast(th_init, root=0)

    # slice th_init and assign the pieces to var_list
    set_from_flat(th_init)

    # synchronize
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------

    # iterator that generates rollout data
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True,
                                     num_reward=num_reward)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()

    # double-ended queues
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards

    if sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) == 0:
        # nothing to be done
        return pi

    assert sum([max_iters>0, total_timesteps>0, max_episodes>0]) < 2, \
        'out of max_iters, total_timesteps, and max_episodes only one should be specified'

    while True:
        if callback: callback(locals(), globals())
        if total_timesteps and timesteps_so_far >= total_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        logger.log("********** Iteration %i ************" % iters_so_far)

        with timed("sampling"):
            seg = seg_gen.__next__()

        # compute discounted returns and advantages
        add_vtarg_and_adv(seg, gamma, lam, num_reward=num_reward)
        # TODO
        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))

        # ob, ac, atarg and tdlamret are all ndarrays
        #ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
        _, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        #print(seg['ob'].shape,type(seg['ob']))
        #print(seg['ac'],type(seg['ac']))
        #print(seg['adv'],type(seg['adv']))
        #print(seg["tdlamret"].shape,type(seg['tdlamret']))
        vpredbefore = seg["vpred"]  # predicted value function before update

        # standardize
        #print("============================== atarg =========================================================")
        #print(atarg)
        atarg = (atarg - np.mean(atarg, axis=0)) / np.std(
            atarg, axis=0)  # standardized advantage function estimate
        #atarg = (atarg) / np.max(np.abs(atarg),axis=0)
        #print('======================================= standardized atarg ====================================')
        #print(atarg)
        if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret)
        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        ## set old parameter values to new parameter values
        assign_old_eq_new()

        G = None
        S = None
        mr_lossbefore = np.zeros((num_reward, len(loss_names)))
        grad_norm = np.zeros((num_reward + 1))
        for i in range(num_reward):
            args = seg["ob"], seg["ac"], atarg[:, i]
            #print(atarg[:,i])
            # a subsample of args: take every 5th element
            fvpargs = [arr[::5] for arr in args]

            # this function computes the product of the Fisher matrix and a vector p
            def fisher_vector_product(p):
                return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

            with timed("computegrad of " + str(i + 1) + ".th reward"):
                *lossbefore, g = compute_lossandgrad(*args)
            lossbefore = allmean(np.array(lossbefore))
            mr_lossbefore[i] = lossbefore
            g = allmean(g)
            #print("***************************************************************")
            #print(g)
            #print('==================='+str(i+1)+"=====================",np.linalg.norm(g))
            #print(atarg[:,i])
            if isinstance(G, np.ndarray):
                G = np.vstack((G, g))
            else:
                G = g

            # g is the gradient of the objective
            # use conjugate gradient to obtain the update direction
            if np.allclose(g, 0):
                logger.log("Got zero gradient. not updating")
            else:
                with timed("cg of " + str(i + 1) + ".th reward"):
                    # stepdir is the update direction
                    stepdir = cg(fisher_vector_product,
                                 g,
                                 cg_iters=cg_iters,
                                 verbose=rank == 0)
                    shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
                    lm = np.sqrt(shs / max_kl)
                    # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
                    fullstep = stepdir / lm
                    #print(np.linalg.norm(fullstep))
                    grad_norm[i] = np.linalg.norm(fullstep)
                assert np.isfinite(stepdir).all()
                if isinstance(S, np.ndarray):
                    S = np.vstack((S, stepdir))
                else:
                    S = stepdir
        #print('======================================= G ====================================')
        #print(G)
        #print('======================================= S ====================================')
        #print(S)
        new_coe = get_coefficient(G, S)
        #coe = 0.99 * coe + 0.01 * new_coe
        coe = new_coe
        coe_save.append(coe)
        # adjust the parameters according to the angle between the gradients
        try:
            GG = np.dot(S, S.T)
            D = np.sqrt(np.diag(1 / np.diag(GG)))
            GG = np.dot(np.dot(D, GG), D)
            #print('======================================= inner product ====================================')
            #print(GG)
            adj = np.sum(GG) / (num_reward**2)
        except Exception:
            adj = 1
        #print('======================================= adj ====================================')
        #print(adj)
        try:
            adj = 1
            adj_save.append(adj)
            adj_max_kl = adj * max_kl
            #################################################################
            grad_norm = grad_norm * np.sqrt(adj)
            stepdir = np.dot(coe, S)
            g = np.dot(coe, G)
            lossbefore = np.dot(coe, mr_lossbefore)
            #################################################################

            shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
            lm = np.sqrt(shs / adj_max_kl)
            # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
            fullstep = stepdir / lm
            grad_norm[num_reward] = np.linalg.norm(fullstep)
            grad_save.append(grad_norm)
            expectedimprove = g.dot(fullstep)
            surrbefore = lossbefore[0]
            stepsize = 1.0
            thbefore = get_flat()

            def compute_mr_losses():
                mr_losses = np.zeros((num_reward, len(loss_names)))
                for i in range(num_reward):
                    args = seg["ob"], seg["ac"], atarg[:, i]
                    one_reward_loss = allmean(np.array(compute_losses(*args)))
                    mr_losses[i] = one_reward_loss
                mr_loss = np.dot(coe, mr_losses)
                return mr_loss, mr_losses

            # do up to 10 line-search steps
            for _ in range(10):
                thnew = thbefore + fullstep * stepsize
                set_from_flat(thnew)
                mr_loss_new, mr_losses_new = compute_mr_losses()
                mr_impro = mr_losses_new - mr_lossbefore
                meanlosses = surr, kl, *_ = allmean(np.array(mr_loss_new))
                improve = surr - surrbefore
                logger.log("Expected: %.3f Actual: %.3f" %
                           (expectedimprove, improve))
                if not np.isfinite(meanlosses).all():
                    logger.log("Got non-finite value of losses -- bad!")
                elif kl > adj_max_kl * 1.5:
                    logger.log("violated KL constraint. shrinking step.")
                elif improve < 0:
                    logger.log("surrogate didn't improve. shrinking step.")
                else:
                    logger.log("Stepsize OK!")
                    impro_save.append(np.hstack((mr_impro[:, 0], improve)))
                    break
                stepsize *= .5
            else:
                logger.log("couldn't compute a good step")
                set_from_flat(thbefore)
            if nworkers > 1 and iters_so_far % 20 == 0:
                paramsums = MPI.COMM_WORLD.allgather(
                    (thnew.sum(), vfadam.getflat().sum()))  # list of tuples
                assert all(
                    np.allclose(ps, paramsums[0]) for ps in paramsums[1:])

            for (lossname, lossval) in zip(loss_names, meanlosses):
                logger.record_tabular(lossname, lossval)

            with timed("vf"):
                #print('======================================= tdlamret ====================================')
                #print(seg["tdlamret"])
                for _ in range(vf_iters):
                    for (mbob, mbret) in dataset.iterbatches(
                        (seg["ob"], seg["tdlamret"]),
                            include_final_partial_batch=False,
                            batch_size=64):
                        #with tf.Session() as sess:
                        #    sess.run(tf.global_variables_initializer())
                        #    aaa = sess.run(pi.vf,feed_dict={ob:mbob,ret:mbret})
                        #    print("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa")
                        #    print(aaa.shape)
                        #    print(mbret.shape)
                        g = allmean(compute_vflossandgrad(mbob, mbret))
                        vfadam.update(g, vf_stepsize)
            #print(mbob,mbret)
        except Exception as e:
            print('update error:', e)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        if MPI is not None:
            listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        else:
            listoflrpairs = [lrlocal]

        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if rank == 0:
            logger.dump_tabular()
        #pdb.set_trace()
    np.save(save_dir + 'coe.npy', coe_save)
    np.save(save_dir + 'grad.npy', grad_save)
    np.save(save_dir + 'improve.npy', impro_save)
    np.save(save_dir + 'adj.npy', adj_save)
    return pi
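
Both learn() above and the other TRPO variants in this collection solve Fx = g with a matrix-free conjugate gradient: cg(fisher_vector_product, g, cg_iters=...). A minimal NumPy sketch of that kind of routine, modeled loosely on baselines.common.cg (names and the toy check are illustrative, not the library's exact code):

import numpy as np

def conjugate_gradient(f_Ax, b, cg_iters=10, residual_tol=1e-10):
    """Solve A x = b where A is available only through the product f_Ax(x)."""
    x = np.zeros_like(b)
    r = b.copy()  # residual b - A x, with x = 0 initially
    p = r.copy()  # search direction
    rdotr = r.dot(r)
    for _ in range(cg_iters):
        Ap = f_Ax(p)
        alpha = rdotr / p.dot(Ap)
        x += alpha * p
        r -= alpha * Ap
        new_rdotr = r.dot(r)
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
        if rdotr < residual_tol:
            break
    return x

# toy check on a small symmetric positive-definite system
A = np.array([[4.0, 1.0], [1.0, 3.0]])
b = np.array([1.0, 2.0])
x = conjugate_gradient(lambda v: A.dot(v), b, cg_iters=25)
print(np.allclose(A.dot(x), b, atol=1e-6))  # True
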
Example #19
0
    def __init__(self, env, world, policies, nsteps, load_path, rho, max_kl,
                 ent_coef, vf_coef, max_grad_norm, sync):
        self.sess = sess = U.get_session()
        self.env = env
        self.world = world
        self.sync = sync
        self.max_kl = max_kl
        if hasattr(env, 'num_envs'):
            self.n_batches = n_batches = nsteps * env.num_envs
        else:
            self.n_batches = n_batches = nsteps

        if MPI is not None:
            self.nworkers = MPI.COMM_WORLD.Get_size()
            self.rank = MPI.COMM_WORLD.Get_rank()
        else:
            self.nworkers = 1
            self.rank = 0

        cpus_per_worker = 1
        U.get_session(config=tf.ConfigProto(
            allow_soft_placement=True,
            inter_op_parallelism_threads=cpus_per_worker,
            intra_op_parallelism_threads=cpus_per_worker))

        # GLOBAL PLACEHOLDERS
        self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])

        self.pi_n, self.oldpi_n, self.vfadam_n, self.exchange_n, self.to_exchange_n = [], [], [], [], []
        self.compute_jtvp_n, self.compute_fvp_n, self.compute_losses_n, self.compute_vfloss_n = [], [], [], []
        self.set_from_flat_n, self.get_flat_n = [], []
        for i in range(world.n):
            name_scope = world.agents[i].name.replace(' ', '')
            with tf.variable_scope(name_scope):
                # OBSERVATION PLACEHOLDER
                ob_dtype = env.observation_space[i].dtype
                ob_shape = env.observation_space[i].shape
                OB = tf.placeholder(dtype=ob_dtype, shape=(None, ) + ob_shape)
                # Policy
                with tf.variable_scope("pi"):
                    pi = policies[i](n_batches, observ_placeholder=OB)
                with tf.variable_scope("oldpi"):
                    oldpi = policies[i](n_batches, observ_placeholder=OB)

                # CREATE OTHER PLACEHOLDERS
                AC = pi.pdtype.sample_placeholder([None])
                ADV = tf.placeholder(dtype=tf.float32, shape=[None])
                R = tf.placeholder(dtype=tf.float32, shape=[None])
                OLDVPRED = tf.placeholder(dtype=tf.float32, shape=[None])
                NB = tf.placeholder(dtype=tf.int32, shape=None)
                A = tf.placeholder(dtype=tf.float32, shape=None)

                ratio = tf.exp(
                    pi.pd.logp(AC) - oldpi.pd.logp(AC)
                )  # be careful about the dimensionality
                surrgain = tf.reduce_mean(ADV * ratio)
                kloldnew = oldpi.pd.kl(pi.pd)
                meankl = tf.reduce_mean(kloldnew)
                sync_err = A * tf.reshape(ratio,
                                          (self.n_batches, )) - tf.reshape(
                                              tf.gather(pi.net.z, NB),
                                              (self.n_batches, ))
                sync_loss = tf.reduce_sum(tf.reshape(tf.gather(pi.net.z, NB), (self.n_batches,)) * sync_err) + \
                            0.5 * rho * tf.reduce_sum(tf.square(sync_err))
                lagrange_loss = -surrgain + sync_loss
                losses = [lagrange_loss, surrgain, meankl]
                dist = meankl

                var_list = pi.net.w
                klgrads = tf.gradients(dist, var_list)
                flat_tangent = tf.placeholder(dtype=tf.float32,
                                              shape=[None],
                                              name="flat_tan")

                shapes = [var.get_shape().as_list() for var in var_list]
                start = 0
                tangents = []
                for shape in shapes:
                    sz = U.intprod(shape)
                    tangents.append(
                        tf.reshape(flat_tangent[start:start + sz], shape))
                    start += sz

                jjvp = [tf.zeros(shape, dtype=tf.float32) for shape in shapes]
                jtvp = [tf.zeros(shape, dtype=tf.float32) for shape in shapes]
                right_b = -ADV + A * tf.gather(
                    pi.net.p, NB) - rho * A * tf.gather(pi.net.z, NB)
                for b in range(self.n_batches):
                    ratio_b_grad = tf.gradients(ratio[b], var_list)
                    jvp_b = tf.add_n([
                        tf.reduce_sum(g * tangent)
                        for (g, tangent) in zipsame(ratio_b_grad, tangents)
                    ])
                    jjvp = [
                        tf.add_n([jj, gg * jvp_b])
                        for (jj, gg) in zipsame(jjvp, ratio_b_grad)
                    ]
                    jtvp = [
                        tf.add_n([jt, gt * right_b[b]])
                        for (jt, gt) in zipsame(jtvp, ratio_b_grad)
                    ]
                    print(b)  # progress while unrolling the per-sample graph

                jjvp = tf.concat(
                    axis=0, values=[tf.reshape(v, [U.numel(v)]) for v in jjvp])
                jtvp = tf.concat(
                    axis=0, values=[tf.reshape(v, [U.numel(v)]) for v in jtvp])
                gvp = tf.add_n([
                    tf.reduce_sum(g * tangent)
                    for (g, tangent) in zipsame(klgrads, tangents)
                ])  #pylint: disable=E1111
                fvp = tf.add_n([U.flatgrad(gvp, var_list), rho * jjvp])

                # Define the value loss
                vpredclipped = OLDVPRED + tf.clip_by_value(
                    pi.vf - OLDVPRED, -CLIPRANGE, CLIPRANGE)
                # vpredclipped = tf.clip_by_value(pi.vf, OLDVPRED*(1-CLIPRANGE), OLDVPRED*(1+CLIPRANGE))
                vferr = tf.square(pi.vf - R)
                vferr2 = tf.square(vpredclipped - R)
                vf_loss = .5 * tf.reduce_mean(tf.maximum(vferr, vferr2))
                vfadam = MpiAdam(pi.net.v)

                compute_jtvp = U.function([OB, AC, ADV, A, NB], jtvp)
                compute_fvp = U.function([flat_tangent, OB, AC, ADV], fvp)
                compute_losses = U.function([OB, AC, ADV, A, NB], losses)
                compute_vfloss = U.function([OB, R, OLDVPRED, CLIPRANGE],
                                            vf_loss)
                exchange = pi.net.exchange(sess, OB, AC, CLIPRANGE, NB, rho)
                to_exchange = U.function(
                    [OB, AC, ADV, NB, CLIPRANGE],
                    [ratio, tf.gather(pi.net.p, NB)])

                get_flat = U.GetFlat(var_list)
                set_from_flat = U.SetFromFlat(var_list)

            self.pi_n.append(pi)
            self.oldpi_n.append(oldpi)
            self.get_flat_n.append(get_flat)
            self.set_from_flat_n.append(set_from_flat)
            self.vfadam_n.append(vfadam)
            self.exchange_n.append(exchange)
            self.to_exchange_n.append(to_exchange)
            self.compute_jtvp_n.append(compute_jtvp)
            self.compute_fvp_n.append(compute_fvp)
            self.compute_losses_n.append(compute_losses)
            self.compute_vfloss_n.append(compute_vfloss)

        # Update old policy network
        updates = []
        for i in range(len(world.agents)):
            name_scope = world.agents[i].name.replace(' ', '')
            old_vars = get_trainable_variables("{}/oldpi".format(name_scope))
            now_vars = get_trainable_variables("{}/pi".format(name_scope))
            updates += [
                tf.assign(oldv, nowv)
                for (oldv, nowv) in zipsame(old_vars, now_vars)
            ]
            updates += [
                tf.assign(self.pi_n[i].net.z, tf.ones_like(self.pi_n[i].net.z))
            ]
        self.assign_old_eq_new = U.function([], [], updates=updates)

        @contextmanager
        def timed(msg):
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))

        self.timed = timed

        def allmean(x):
            assert isinstance(x, np.ndarray)
            if MPI is not None:
                out = np.empty_like(x)
                MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
                out /= self.nworkers
            else:
                out = np.copy(x)

            return out

        self.allmean = allmean

        # Initialization
        U.initialize()
        if load_path is not None:
            self.load(load_path)

            for i in range(len(self.pi_n)):
                th_init = self.get_flat_n[i]()
                self.set_from_flat_n[i](th_init)
                print("Init param sum", th_init.sum(), flush=True)

        for vfadam in self.vfadam_n:
            vfadam.sync()
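
Every example in this collection pairs U.GetFlat with U.SetFromFlat to move between a list of TF variables and one flat parameter vector (e.g. for the MPI broadcast of th_init). A simplified sketch of that pattern in the same TF1-style graph code as the snippets above; this illustrates the idea and is not baselines' exact implementation:

import numpy as np
import tensorflow as tf  # assumes a TF1-compatible API, as used throughout these examples

def numel(v):
    # number of scalar elements in a variable
    return int(np.prod(v.get_shape().as_list()))

def build_get_flat(var_list):
    # concatenate every variable, flattened, into a single vector
    flat = tf.concat([tf.reshape(v, [numel(v)]) for v in var_list], axis=0)
    return lambda: tf.get_default_session().run(flat)

def build_set_from_flat(var_list):
    # split a flat vector placeholder into per-variable slices and assign them
    total = sum(numel(v) for v in var_list)
    theta = tf.placeholder(tf.float32, [total])
    start, assigns = 0, []
    for v in var_list:
        size = numel(v)
        new_value = tf.reshape(theta[start:start + size], v.get_shape().as_list())
        assigns.append(tf.assign(v, new_value))
        start += size
    op = tf.group(*assigns)
    return lambda flat_values: tf.get_default_session().run(op, feed_dict={theta: flat_values})

With these two helpers a single NumPy vector can be broadcast over MPI and written back into the graph in one call, which is exactly how th_init is synchronized in the learn() functions above and below.
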
Example #20
0
    def __init__(self, timed, policy, ob_space, ac_space, max_kl=0.001, cg_iters=10,
        ent_coef=0.0, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters=3, load_path=None, num_reward=1, index=1):
        if MPI is not None:
            nworkers = MPI.COMM_WORLD.Get_size()
            rank = MPI.COMM_WORLD.Get_rank()
        else:
            nworkers = 1
            rank = 0

        cpus_per_worker = 1
        U.get_session(config=tf.ConfigProto(
                allow_soft_placement=True,
                inter_op_parallelism_threads=cpus_per_worker,
                intra_op_parallelism_threads=cpus_per_worker
        ))
        
        #################################################################
        # ob, ac, ret and atarg are all placeholders
        # here ret and atarg should be vector-valued (one entry per reward)
        ob = observation_placeholder(ob_space)

        # build pi and oldpi
        with tf.variable_scope(str(index)+"pi"):
            pi = policy(observ_placeholder=ob)
        with tf.variable_scope(str(index)+"oldpi"):
            oldpi = policy(observ_placeholder=ob)
        
        # each reward gets its own atarg
        atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable)
        ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return

        ac = pi.pdtype.sample_placeholder([None])
        
        # the KL divergence and entropy here do not depend on the reward
        ##################################
        kloldnew = oldpi.pd.kl(pi.pd)
        ent = pi.pd.entropy()
        meankl = tf.reduce_mean(kloldnew)
        meanent = tf.reduce_mean(ent)
        # entbonus is the entropy loss term
        entbonus = ent_coef * meanent
        #################################
        
        ###########################################################
        # vferr is used to update the value network
        vferr = tf.reduce_mean(tf.square(pi.vf - ret))
        ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) 
        # advantage * pnew / pold
        surrgain = tf.reduce_mean(ratio * atarg)

        # optimgain is used to update the policy network; there should be one per reward
        optimgain = surrgain + entbonus
        losses = [optimgain, meankl, entbonus, surrgain, meanent]
        loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]
       
        ###########################################################
        dist = meankl
        
        # define the variables to optimize and the Adam optimizer for the value network
        all_var_list = get_trainable_variables(str(index)+"pi")
        # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
        # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]
        var_list = get_pi_trainable_variables(str(index)+"pi")
        vf_var_list = get_vf_trainable_variables(str(index)+"pi")

        vfadam = MpiAdam(vf_var_list)

        # helper that flattens the variables into a single vector
        get_flat = U.GetFlat(var_list)

        # helper that slices a flat vector and assigns the pieces to the variables in var_list
        set_from_flat = U.SetFromFlat(var_list)
        # gradient of the KL divergence
        klgrads = tf.gradients(dist, var_list)
        
        ####################################################################
        # flattened tangent vector
        flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan")

        # split the flattened vector back into per-variable tensors
        shapes = [var.get_shape().as_list() for var in var_list]
        start = 0
        tangents = []
        for shape in shapes:
            sz = U.intprod(shape)
            tangents.append(tf.reshape(flat_tangent[start:start+sz], shape))
            start += sz
        ####################################################################
        
        ####################################################################
        # sum the products of the KL gradients and the tangent pieces
        gvp = tf.add_n([tf.reduce_sum(g*tangent) for (g, tangent) in zipsame(klgrads, tangents)]) #pylint: disable=E1111
        # flatten the gradient of gvp into a vector
        fvp = U.flatgrad(gvp, var_list)
        ####################################################################

        # update the old policy with the newly learned policy
        assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv)
            for (oldv, newv) in zipsame(get_variables(str(index)+"oldpi"), get_variables(str(index)+"pi"))])
        
        
        # compute the losses
        compute_losses = U.function([ob, ac, atarg], losses)
        # compute the losses and the gradient
        compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)])
        # compute the Fisher-vector product
        compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
        # compute the gradient of the value network loss
        compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list))
        

        # initialize the variables
        U.initialize()
        if load_path is not None:
            pi.load(load_path)
        
        # get the initial parameter vector
        th_init = get_flat()
        if MPI is not None:
            MPI.COMM_WORLD.Bcast(th_init, root=0)
        
        # slice th_init and assign the pieces to var_list
        set_from_flat(th_init)

        # synchronize
        vfadam.sync()
        print("Init param sum", th_init.sum(), flush=True)


        self.MPI = MPI
        self.pi = pi
        self.oldpi = oldpi

        self.compute_losses = compute_losses
        self.compute_lossandgrad = compute_lossandgrad
        self.compute_fvp = compute_fvp
        self.compute_vflossandgrad = compute_vflossandgrad

        self.assign_old_eq_new = assign_old_eq_new
        self.get_flat = get_flat
        self.set_from_flat = set_from_flat
        self.vfadam = vfadam
        # params
        self.max_kl = max_kl
        self.cg_iters = cg_iters
        self.ent_coef = ent_coef
        self.cg_damping = cg_damping
        self.vf_stepsize = vf_stepsize
        self.vf_iters = vf_iters
        
        self.rank = rank
        self.index = index
        self.timed = timed
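
The learn() below, like the one in Example #18, turns the conjugate-gradient direction into fullstep = stepdir / lm and then shrinks it with a backtracking line search until the surrogate objective improves and the KL constraint holds. A minimal standalone sketch of that search, with eval_surrogate and eval_kl standing in for the compute_losses machinery (hypothetical helpers, not part of this code base):

import numpy as np

def backtracking_line_search(theta_before, fullstep, eval_surrogate, eval_kl,
                             max_kl, max_backtracks=10):
    # halve the step until it both improves the surrogate objective
    # and keeps the KL divergence within 1.5 * max_kl
    surr_before = eval_surrogate(theta_before)
    stepsize = 1.0
    for _ in range(max_backtracks):
        theta_new = theta_before + stepsize * fullstep
        surr = eval_surrogate(theta_new)
        if np.isfinite(surr) and eval_kl(theta_new) <= 1.5 * max_kl and surr - surr_before > 0:
            return theta_new  # accepted step
        stepsize *= 0.5
    return theta_before  # no acceptable step found; keep the old parameters
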
def learn(
        *,
        network,
        env,
        eval_env,
        make_eval_env,
        env_id,
        total_timesteps,
        timesteps_per_batch,
        sil_update,
        sil_loss,  # what to train on
        max_kl=0.001,
        cg_iters=10,
        gamma=0.99,
        lam=1.0,  # advantage estimation
        seed=None,
        ent_coef=0.0,
        lr=3e-4,
        cg_damping=1e-2,
        vf_stepsize=3e-4,
        vf_iters=5,
        sil_value=0.01,
        sil_alpha=0.6,
        sil_beta=0.1,
        max_episodes=0,
        max_iters=0,  # time constraint
        callback=None,
        save_interval=0,
        load_path=None,
        # MBL
        # For train mbl
        mbl_train_freq=5,

        # For eval
        num_eval_episodes=5,
        eval_freq=5,
        vis_eval=False,
        eval_targs=('mbmf', ),
        #eval_targs=('mf',),
        quant=2,

        # For mbl.step
        #num_samples=(1500,),
        num_samples=(1, ),
        horizon=(2, ),
        #horizon=(2,1),
        #num_elites=(10,),
        num_elites=(1, ),
        mbl_lamb=(1.0, ),
        mbl_gamma=0.99,
        #mbl_sh=1, # Number of step for stochastic sampling
        mbl_sh=10000,
        #vf_lookahead=-1,
        #use_max_vf=False,
        reset_per_step=(0, ),

        # For get_model
        num_fc=2,
        num_fwd_hidden=500,
        use_layer_norm=False,

        # For MBL
        num_warm_start=int(1e4),
        init_epochs=10,
        update_epochs=5,
        batch_size=512,
        update_with_validation=False,
        use_mean_elites=1,
        use_ent_adjust=0,
        adj_std_scale=0.5,

        # For data loading
        validation_set_path=None,

        # For data collect
        collect_val_data=False,

        # For traj collect
        traj_collect='mf',

        # For profile
        measure_time=True,
        eval_val_err=False,
        measure_rew=True,
        model_fn=None,
        update_fn=None,
        init_fn=None,
        mpi_rank_weight=1,
        comm=None,
        vf_coef=0.5,
        max_grad_norm=0.5,
        log_interval=1,
        nminibatches=4,
        noptepochs=4,
        cliprange=0.2,
        **network_kwargs):
    '''
    Learn a policy function with the TRPO algorithm.

    Parameters:
    ----------

    network                 neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types)
                            or function that takes input placeholder and returns tuple (output, None) for feedforward nets
                            or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets

    env                     environment (one of the gym environments or wrapped via baselines.common.vec_env.VecEnv-type class)

    timesteps_per_batch     timesteps per gradient estimation batch

    max_kl                  max KL divergence between old policy and new policy ( KL(pi_old || pi) )

    ent_coef                coefficient of policy entropy term in the optimization objective

    cg_iters                number of iterations of conjugate gradient algorithm

    cg_damping              conjugate gradient damping

    vf_stepsize             learning rate for the Adam optimizer used to optimize the value function loss

    vf_iters                number of value function optimization iterations per policy optimization step

    total_timesteps           max number of timesteps

    max_episodes            max number of episodes

    max_iters               maximum number of policy optimization iterations

    callback                function to be called with (locals(), globals()) each policy optimization step

    load_path               str, path to load the model from (default: None, i.e. no model is loaded)

    **network_kwargs        keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network

    Returns:
    -------

    learnt model

    '''
    if not isinstance(num_samples, tuple): num_samples = (num_samples, )
    if not isinstance(horizon, tuple): horizon = (horizon, )
    if not isinstance(num_elites, tuple): num_elites = (num_elites, )
    if not isinstance(mbl_lamb, tuple): mbl_lamb = (mbl_lamb, )
    if not isinstance(reset_per_step, tuple):
        reset_per_step = (reset_per_step, )
    if validation_set_path is None:
        if collect_val_data:
            validation_set_path = os.path.join(logger.get_dir(), 'val.pkl')
        else:
            validation_set_path = os.path.join('dataset',
                                               '{}-val.pkl'.format(env_id))
    if eval_val_err:
        eval_val_err_path = os.path.join('dataset',
                                         '{}-combine-val.pkl'.format(env_id))
    logger.log(locals())
    logger.log('MBL_SH', mbl_sh)
    logger.log('Traj_collect', traj_collect)

    if MPI is not None:
        nworkers = MPI.COMM_WORLD.Get_size()
        rank = MPI.COMM_WORLD.Get_rank()
    else:
        nworkers = 1
        rank = 0
    cpus_per_worker = 1
    U.get_session(
        config=tf.ConfigProto(allow_soft_placement=True,
                              inter_op_parallelism_threads=cpus_per_worker,
                              intra_op_parallelism_threads=cpus_per_worker))

    set_global_seeds(seed)
    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)

    policy = build_policy(env, network, value_network='copy', **network_kwargs)
    nenvs = env.num_envs
    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    nbatch = nenvs * timesteps_per_batch
    nbatch_train = nbatch // nminibatches
    is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0)

    ob = observation_placeholder(ob_space)
    with tf.variable_scope("pi"):
        pi = policy(observ_placeholder=ob)
        make_model = lambda: Model(
            policy=policy,
            ob_space=ob_space,
            ac_space=ac_space,
            nbatch_act=nenvs,
            nbatch_train=nbatch_train,
            nsteps=timesteps_per_batch,
            ent_coef=ent_coef,
            vf_coef=vf_coef,
            max_grad_norm=max_grad_norm,
            sil_update=sil_update,
            sil_value=sil_value,
            sil_alpha=sil_alpha,
            sil_beta=sil_beta,
            sil_loss=sil_loss,
            #                                    fn_reward=env.process_reward,
            fn_reward=None,
            #                                    fn_obs=env.process_obs,
            fn_obs=None,
            ppo=False,
            prev_pi='pi',
            silm=pi)
        model = make_model()
    with tf.variable_scope("oldpi"):
        oldpi = policy(observ_placeholder=ob)
        make_old_model = lambda: Model(
            policy=policy,
            ob_space=ob_space,
            ac_space=ac_space,
            nbatch_act=nenvs,
            nbatch_train=nbatch_train,
            nsteps=timesteps_per_batch,
            ent_coef=ent_coef,
            vf_coef=vf_coef,
            max_grad_norm=max_grad_norm,
            sil_update=sil_update,
            sil_value=sil_value,
            sil_alpha=sil_alpha,
            sil_beta=sil_beta,
            sil_loss=sil_loss,
            #                                    fn_reward=env.process_reward,
            fn_reward=None,
            #                                    fn_obs=env.process_obs,
            fn_obs=None,
            ppo=False,
            prev_pi='oldpi',
            silm=oldpi)
        old_model = make_old_model()

    # MBL
    # ---------------------------------------
    #viz = Visdom(env=env_id)
    win = None
    eval_targs = list(eval_targs)
    logger.log(eval_targs)

    make_model_f = get_make_mlp_model(num_fc=num_fc,
                                      num_fwd_hidden=num_fwd_hidden,
                                      layer_norm=use_layer_norm)
    mbl = MBL(env=eval_env,
              env_id=env_id,
              make_model=make_model_f,
              num_warm_start=num_warm_start,
              init_epochs=init_epochs,
              update_epochs=update_epochs,
              batch_size=batch_size,
              **network_kwargs)

    val_dataset = {'ob': None, 'ac': None, 'ob_next': None}
    if update_with_validation:
        logger.log('Update with validation')
        val_dataset = load_val_data(validation_set_path)
    if eval_val_err:
        logger.log('Log val error')
        eval_val_dataset = load_val_data(eval_val_err_path)
    if collect_val_data:
        logger.log('Collect validation data')
        val_dataset_collect = []

    def _mf_pi(ob, t=None):
        stochastic = True
        ac, vpred, _, _ = pi.step(ob, stochastic=stochastic)
        return ac, vpred

    def _mf_det_pi(ob, t=None):
        #ac, vpred, _, _ = pi.step(ob, stochastic=False)
        ac, vpred = pi._evaluate([pi.pd.mode(), pi.vf], ob)
        return ac, vpred

    def _mf_ent_pi(ob, t=None):
        mean, std, vpred = pi._evaluate([pi.pd.mode(), pi.pd.std, pi.vf], ob)
        ac = np.random.normal(mean, std * adj_std_scale, size=mean.shape)
        return ac, vpred
# use_ent_adjust: if set, sample actions with the policy std scaled by adj_std_scale

    def _mbmf_inner_pi(ob, t=0):
        if use_ent_adjust:
            return _mf_ent_pi(ob)
        else:
            #return _mf_pi(ob)
            if t < mbl_sh: return _mf_pi(ob)
            else: return _mf_det_pi(ob)

    # ---------------------------------------

    # Run multiple configuration once
    all_eval_descs = []

    def make_mbmf_pi(n, h, e, l):
        def _mbmf_pi(ob):
            ac, rew = mbl.step(ob=ob,
                               pi=_mbmf_inner_pi,
                               horizon=h,
                               num_samples=n,
                               num_elites=e,
                               gamma=mbl_gamma,
                               lamb=l,
                               use_mean_elites=use_mean_elites)
            return ac[None], rew

        return Policy(step=_mbmf_pi, reset=None)

    for n in num_samples:
        for h in horizon:
            for l in mbl_lamb:
                for e in num_elites:
                    if 'mbmf' in eval_targs:
                        all_eval_descs.append(('MeanRew', 'MBL_TRPO_SIL',
                                               make_mbmf_pi(n, h, e, l)))
                    #if 'mbmf' in eval_targs: all_eval_descs.append(('MeanRew-n-{}-h-{}-e-{}-l-{}-sh-{}-me-{}'.format(n, h, e, l, mbl_sh, use_mean_elites), 'MBL_TRPO-n-{}-h-{}-e-{}-l-{}-sh-{}-me-{}'.format(n, h, e, l, mbl_sh, use_mean_elites), make_mbmf_pi(n, h, e, l)))
    if 'mf' in eval_targs:
        all_eval_descs.append(
            ('MeanRew', 'TRPO_SIL', Policy(step=_mf_pi, reset=None)))

    logger.log('List of evaluation targets')
    for it in all_eval_descs:
        logger.log(it[0])

    pool = Pool(mp.cpu_count())
    warm_start_done = False
    # ----------------------------------------

    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    entbonus = ent_coef * meanent

    vferr = tf.reduce_mean(tf.square(pi.vf - ret))

    ratio = tf.exp(pi.pd.logp(ac) -
                   oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = tf.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = get_trainable_variables("pi")
    # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
    # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]
    var_list = get_pi_trainable_variables("pi")
    vf_var_list = get_vf_trainable_variables("pi")

    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32,
                                  shape=[None],
                                  name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n([
        tf.reduce_sum(g * tangent)
        for (g, tangent) in zipsame(klgrads, tangents)
    ])  #pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(get_variables("oldpi"), get_variables("pi"))
        ])

    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses +
                                     [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret],
                                       U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    U.initialize()
    if load_path is not None:
        pi.load(load_path)

    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)
    # Prepare for rollouts
    # ----------------------------------------
    if traj_collect == 'mf':
        seg_gen = traj_segment_generator(env,
                                         timesteps_per_batch,
                                         model,
                                         stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards

    if sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) == 0:
        # nothing to be done
        return pi

    assert sum([max_iters>0, total_timesteps>0, max_episodes>0]) < 2, \
        'out of max_iters, total_timesteps, and max_episodes only one should be specified'

    while True:
        if callback: callback(locals(), globals())
        if total_timesteps and timesteps_so_far >= total_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        logger.log("********** Iteration %i ************" % iters_so_far)

        with timed("sampling"):
            seg = seg_gen.__next__()
            if traj_collect == 'mf-random' or traj_collect == 'mf-mb':
                seg_mbl = seg_gen_mbl.__next__()
            else:
                seg_mbl = seg
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]

        # Val data collection
        if collect_val_data:
            for ob_, ac_, ob_next_ in zip(ob[:-1, 0, ...], ac[:-1, ...],
                                          ob[1:, 0, ...]):
                val_dataset_collect.append(
                    (copy.copy(ob_), copy.copy(ac_), copy.copy(ob_next_)))
        # -----------------------------
        # MBL update
        else:
            ob_mbl, ac_mbl = seg_mbl["ob"], seg_mbl["ac"]

            mbl.add_data_batch(ob_mbl[:-1, 0, ...], ac_mbl[:-1, ...],
                               ob_mbl[1:, 0, ...])
            mbl.update_forward_dynamic(require_update=iters_so_far %
                                       mbl_train_freq == 0,
                                       ob_val=val_dataset['ob'],
                                       ac_val=val_dataset['ac'],
                                       ob_next_val=val_dataset['ob_next'])
        # -----------------------------

        if traj_collect == 'mf':
            #if traj_collect == 'mf' or traj_collect == 'mf-random' or traj_collect == 'mf-mb':
            vpredbefore = seg[
                "vpred"]  # predicted value function before udpate
            model = seg["model"]
            atarg = (atarg - atarg.mean()) / atarg.std(
            )  # standardized advantage function estimate

            if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret)
            if hasattr(pi, "rms"):
                pi.rms.update(ob)  # update running mean/std for policy

            args = seg["ob"], seg["ac"], atarg
            fvpargs = [arr[::5] for arr in args]

            def fisher_vector_product(p):
                return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

            assign_old_eq_new(
            )  # set old parameter values to new parameter values
            with timed("computegrad"):
                *lossbefore, g = compute_lossandgrad(*args)
            lossbefore = allmean(np.array(lossbefore))
            g = allmean(g)
            if np.allclose(g, 0):
                logger.log("Got zero gradient. not updating")
            else:
                with timed("cg"):
                    stepdir = cg(fisher_vector_product,
                                 g,
                                 cg_iters=cg_iters,
                                 verbose=rank == 0)
                assert np.isfinite(stepdir).all()
                shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
                lm = np.sqrt(shs / max_kl)
                # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
                fullstep = stepdir / lm
                expectedimprove = g.dot(fullstep)
                surrbefore = lossbefore[0]
                stepsize = 1.0
                thbefore = get_flat()
                for _ in range(10):
                    thnew = thbefore + fullstep * stepsize
                    set_from_flat(thnew)
                    meanlosses = surr, kl, *_ = allmean(
                        np.array(compute_losses(*args)))
                    improve = surr - surrbefore
                    logger.log("Expected: %.3f Actual: %.3f" %
                               (expectedimprove, improve))
                    if not np.isfinite(meanlosses).all():
                        logger.log("Got non-finite value of losses -- bad!")
                    elif kl > max_kl * 1.5:
                        logger.log("violated KL constraint. shrinking step.")
                    elif improve < 0:
                        logger.log("surrogate didn't improve. shrinking step.")
                    else:
                        logger.log("Stepsize OK!")
                        break
                    stepsize *= .5
                else:
                    logger.log("couldn't compute a good step")
                    set_from_flat(thbefore)
                if nworkers > 1 and iters_so_far % 20 == 0:
                    paramsums = MPI.COMM_WORLD.allgather(
                        (thnew.sum(),
                         vfadam.getflat().sum()))  # list of tuples
                    assert all(
                        np.allclose(ps, paramsums[0]) for ps in paramsums[1:])

            for (lossname, lossval) in zip(loss_names, meanlosses):
                logger.record_tabular(lossname, lossval)

            with timed("vf"):

                for _ in range(vf_iters):
                    for (mbob, mbret) in dataset.iterbatches(
                        (seg["ob"], seg["tdlamret"]),
                            include_final_partial_batch=False,
                            batch_size=64):
                        g = allmean(compute_vflossandgrad(mbob, mbret))
                        vfadam.update(g, vf_stepsize)
            with timed("SIL"):
                lrnow = lr(1.0 - timesteps_so_far / total_timesteps)
                l_loss, sil_adv, sil_samples, sil_nlogp = model.sil_train(
                    lrnow)

            logger.record_tabular("ev_tdlam_before",
                                  explained_variance(vpredbefore, tdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        if MPI is not None:
            listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        else:
            listoflrpairs = [lrlocal]
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if sil_update > 0:
            logger.record_tabular("SilSamples", sil_samples)

        if rank == 0:
            # MBL evaluation
            if not collect_val_data:
                #set_global_seeds(seed)
                default_sess = tf.get_default_session()

                def multithread_eval_policy(env_, pi_, num_episodes_,
                                            vis_eval_, seed):
                    with default_sess.as_default():
                        if hasattr(env, 'ob_rms') and hasattr(env_, 'ob_rms'):
                            env_.ob_rms = env.ob_rms
                        res = eval_policy(env_, pi_, num_episodes_, vis_eval_,
                                          seed, measure_time, measure_rew)

                        try:
                            env_.close()
                        except Exception:
                            pass
                    return res

                if mbl.is_warm_start_done() and iters_so_far % eval_freq == 0:
                    warm_start_done = mbl.is_warm_start_done()
                    if num_eval_episodes > 0:
                        targs_names = {}
                        with timed('eval'):
                            num_descs = len(all_eval_descs)
                            list_field_names = [e[0] for e in all_eval_descs]
                            list_legend_names = [e[1] for e in all_eval_descs]
                            list_pis = [e[2] for e in all_eval_descs]
                            list_eval_envs = [
                                make_eval_env() for _ in range(num_descs)
                            ]
                            list_seed = [seed for _ in range(num_descs)]
                            list_num_eval_episodes = [
                                num_eval_episodes for _ in range(num_descs)
                            ]
                            print(list_field_names)
                            print(list_legend_names)

                            list_vis_eval = [
                                vis_eval for _ in range(num_descs)
                            ]

                            for i in range(num_descs):
                                field_name, legend_name = list_field_names[i], list_legend_names[i]

                                res = multithread_eval_policy(
                                    list_eval_envs[i], list_pis[i],
                                    list_num_eval_episodes[i],
                                    list_vis_eval[i], seed)
                                #eval_results = pool.starmap(multithread_eval_policy, zip(list_eval_envs, list_pis, list_num_eval_episodes, list_vis_eval,list_seed))

                                #for field_name, legend_name, res in zip(list_field_names, list_legend_names, eval_results):
                                perf, elapsed_time, eval_rew = res
                                logger.record_tabular(field_name, perf)
                                if measure_time:
                                    logger.record_tabular(
                                        'Time-%s' % (field_name), elapsed_time)
                                if measure_rew:
                                    logger.record_tabular(
                                        'SimRew-%s' % (field_name), eval_rew)
                                targs_names[field_name] = legend_name

                    if eval_val_err:
                        fwd_dynamics_err = mbl.eval_forward_dynamic(
                            obs=eval_val_dataset['ob'],
                            acs=eval_val_dataset['ac'],
                            obs_next=eval_val_dataset['ob_next'])
                        logger.record_tabular('FwdValError', fwd_dynamics_err)

                    logger.dump_tabular()
                    #print(logger.get_dir())
                    #print(targs_names)
                    #if num_eval_episodes > 0:


#                        win = plot(viz, win, logger.get_dir(), targs_names=targs_names, quant=quant, opt='best')
# -----------
#logger.dump_tabular()
        yield pi

    if collect_val_data:
        with open(validation_set_path, 'wb') as f:
            pickle.dump(val_dataset_collect, f)
        logger.log('Saved {} validation samples'.format(len(val_dataset_collect)))
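
The TRPO update at the start of this listing follows the conjugate-gradient step with a backtracking line search: the full natural-gradient step is halved until the surrogate improves and the mean KL stays under 1.5 * max_kl, otherwise the old parameters are restored. Below is a minimal NumPy sketch of that acceptance rule; eval_losses is a hypothetical stand-in for compute_losses and this is an illustration, not the baselines implementation.

import numpy as np

def backtracking_line_search(theta_before, fullstep, surr_before,
                             eval_losses, max_kl, max_backtracks=10):
    """Accept the largest halved fraction of fullstep that improves the
    surrogate while keeping the mean KL below 1.5 * max_kl."""
    stepsize = 1.0
    for _ in range(max_backtracks):
        theta_new = theta_before + fullstep * stepsize
        surr, kl = eval_losses(theta_new)   # hypothetical: returns (surrogate, mean KL)
        improve = surr - surr_before
        if not np.isfinite([surr, kl]).all():
            pass                            # non-finite losses: shrink and retry
        elif kl > 1.5 * max_kl:
            pass                            # KL constraint violated: shrink and retry
        elif improve <= 0:
            pass                            # surrogate did not improve: shrink and retry
        else:
            return theta_new                # step accepted
        stepsize *= 0.5
    return theta_before                     # no acceptable step: keep old parameters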
示例#22
0
def learn(env, policy_func, med_func, expert_dataset, pretrained, pretrained_weight, g_step, m_step, e_step, inner_iters, save_per_iter,
          ckpt_dir, log_dir, timesteps_per_batch, task_name, max_kl=0.01, max_timesteps=0, max_episodes=0, max_iters=0,
          batch_size=64, med_stepsize=1e-3, pi_stepsize=1e-3, callback=None, writer=None):
    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space, reuse=(pretrained_weight != None))
    oldpi = policy_func("oldpi", ob_space, ac_space)
    med = med_func("mediator", ob_space, ac_space)
    pi_var_list = pi.get_trainable_variables()
    med_var_list = med.get_trainable_variables()
    g_ob = U.get_placeholder(name="g_ob", dtype=tf.float32, shape=[None] + list(ob_space.shape))
    g_ac = U.get_placeholder(name='g_ac', dtype=tf.float32, shape=[None] + list(ac_space.shape))
    e_ob = U.get_placeholder(name='e_ob', dtype=tf.float32, shape=[None] + list(ob_space.shape))
    e_ac = U.get_placeholder(name='e_ac', dtype=tf.float32, shape=[None] + list(ac_space.shape))
    med_loss = -tf.reduce_mean(med.g_pd.logp(g_ac) + med.e_pd.logp(e_ac)) * 0.5
    #pi_loss = -0.5 * (tf.reduce_mean(pi.pd.logp(ac) - med.pd.logp(ac)))
    g_pdf = tfd.MultivariateNormalDiag(loc=pi.pd.mean, scale_diag=pi.pd.std)
    m_pdf = tfd.MultivariateNormalDiag(loc=med.g_pd.mean, scale_diag=med.g_pd.std)
    pi_loss = tf.reduce_mean(g_pdf.cross_entropy(m_pdf) - g_pdf.entropy())  # tf.reduce_mean(pi.pd.kl(med.pd))
    kloldnew = oldpi.pd.kl(pi.pd)
    meankl = tf.reduce_mean(kloldnew)
    dist = meankl
    expert_loss = -tf.reduce_mean(pi.pd.logp(e_ac))


    assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])

    compute_med_loss = U.function([g_ob, g_ac, e_ob, e_ac], med_loss)
    compute_pi_loss = U.function([g_ob], pi_loss)
    compute_exp_loss = U.function([e_ob, e_ac], expert_loss)
    # compute_kl_loss = U.function([ob], dist)
    # compute_fvp = U.function([flat_tangent, ob, ac], fvp)
    compute_med_lossandgrad = U.function([g_ob, g_ac, e_ob, e_ac], [med_loss, U.flatgrad(med_loss, med_var_list)])
    compute_pi_lossandgrad = U.function([g_ob],  [pi_loss, U.flatgrad(pi_loss, pi_var_list)])
    compute_exp_lossandgrad = U.function([e_ob, e_ac], [expert_loss, U.flatgrad(expert_loss, pi_var_list)])
    get_flat = U.GetFlat(pi_var_list)
    set_from_flat = U.SetFromFlat(pi_var_list)
    med_adam = MpiAdam(med_var_list)
    pi_adam = MpiAdam(pi_var_list)

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    U.initialize()
    # th_init = get_flat()
    # MPI.COMM_WORLD.Bcast(th_init, root=0)
    # set_from_flat(th_init)

    med_adam.sync()
    pi_adam.sync()
    # if rank == 0:
    #     print("Init pi param sum %d, init med param sum %d." % (th_pi_init.sum(), th_med_init.sum()), flush=True)

    seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    true_rewbuffer = deque(maxlen=40)

    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1
    loss_stats = stats(["med_loss", "pi_loss"])
    ep_stats = stats(["True_rewards", "Episode_length"])

    if pretrained_weight is not None:
        U.load_state(pretrained_weight, var_list=pi_var_list)

    med_losses = []
    pi_losses = []

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break

        # Save model
        if rank == 0 and iters_so_far % save_per_iter == 0 and ckpt_dir is not None:
            fname = os.path.join(ckpt_dir, task_name)
            os.makedirs(os.path.dirname(fname), exist_ok=True)
            saver = tf.train.Saver()
            saver.save(tf.get_default_session(), fname)

        logger.log("********** Iteration %i ************" % iters_so_far)
        # ======= Optimize Mediator=========

        seg = seg_gen.__next__()
        g_ob, g_ac = seg['ob'], seg['ac']
        #assign_old_eq_new()
        #stepsize = 3e-4
        # thbefore = get_flat()
        d = dataset.Dataset(dict(ob=g_ob, ac=g_ac))
        optim_batchsize = min(batch_size, len(g_ob))
        g_loss = []
        logger.log("Optimizing Generator...")
        for _ in range(1):
            g_batch = d.next_batch(optim_batchsize)
            g_batch_ob, g_batch_ac = g_batch['ob'], g_batch['ac']
            if hasattr(pi, "obs_rms"):
                pi.obs_rms.update(g_batch_ob)
            pi_loss, g = compute_pi_lossandgrad(g_batch_ob)
            # kl = compute_kl_loss(g_ob)
            # if kl > max_kl * 1.5:
            #     logger.log("violated KL constraint. Shrinking step.")
            #     # stepsize *= 0.1
            #     break
            # else:
            #     logger.log("Stepsize OK!")
            pi_adam.update(allmean(g), pi_stepsize)
            g_loss.append(pi_loss)
        pi_losses.append(np.mean(np.array(g_loss)))
        med_loss = []
        logger.log("Optimizing Mediator...")
        for g_ob_batch, g_ac_batch in dataset.iterbatches((seg['ob'], seg['ac']), include_final_partial_batch=False, batch_size=batch_size):
            # g_batch = d.next_batch(optim_batchsize)
            # g_ob_batch, g_ac_batch = g_batch['ob'], g_batch['ac']
            e_ob_batch, e_ac_batch = expert_dataset.get_next_batch(optim_batchsize)
            if hasattr(med, "obs_rms"):
                med.obs_rms.update(np.concatenate((g_ob_batch, e_ob_batch), 0))

            newlosses, g = compute_med_lossandgrad(g_ob_batch, g_ac_batch, e_ob_batch, e_ac_batch)
            med_adam.update(allmean(g), med_stepsize)
            med_loss.append(newlosses)
        med_losses.append(np.mean(np.array(med_loss)))
        #logger.record_tabular("med_loss_each_iter", np.mean(np.array(med_losses)))



            #logger.record_tabular("gen_loss_each_iter", np.mean(np.array(pi_losses)))
        #logger.record_tabular("expert_loss_each_iter", np.mean(np.array(exp_losses)))
        logger.record_tabular("med_loss_each_iter", np.mean(np.array(med_losses)))
        logger.record_tabular("gen_loss_each_iter", np.mean(np.array(pi_losses)))

        lrlocal = (seg["ep_lens"], seg["ep_true_rets"])
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)
        lens, true_rets = map(flatten_lists, zip(*listoflrpairs))
        true_rewbuffer.extend(true_rets)
        lenbuffer.extend(lens)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if writer is not None:
            loss_stats.add_all_summary(writer, [np.mean(np.array(med_losses)), np.mean(np.array(pi_losses))], episodes_so_far)
            ep_stats.add_all_summary(writer, [np.mean(true_rewbuffer), np.mean(lenbuffer)], episodes_so_far)
        if rank == 0:
            logger.dump_tabular()
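
The generator loss in this example relies on the identity KL(p||q) = H(p, q) - H(p): for diagonal Gaussians, cross-entropy to the mediator minus the policy's own entropy is exactly the forward KL that the inline comment tf.reduce_mean(pi.pd.kl(med.pd)) alludes to. A small NumPy sketch of the closed-form quantities follows; the names are illustrative and this is not the tfd implementation.

import numpy as np

def diag_gauss_entropy(std_p):
    """Entropy H(p) of a diagonal Gaussian, summed over dimensions."""
    return np.sum(0.5 * np.log(2.0 * np.pi * np.e * std_p ** 2))

def diag_gauss_cross_entropy(mu_p, std_p, mu_q, std_q):
    """Cross-entropy H(p, q) = -E_p[log q], summed over dimensions."""
    var_p, var_q = std_p ** 2, std_q ** 2
    return np.sum(0.5 * np.log(2.0 * np.pi * var_q)
                  + (var_p + (mu_p - mu_q) ** 2) / (2.0 * var_q))

def diag_gauss_kl(mu_p, std_p, mu_q, std_q):
    """Closed-form KL(p || q) for diagonal Gaussians."""
    var_p, var_q = std_p ** 2, std_q ** 2
    return np.sum(np.log(std_q / std_p)
                  + (var_p + (mu_p - mu_q) ** 2) / (2.0 * var_q) - 0.5)

mu_p, std_p = np.array([0.1, -0.2]), np.array([0.5, 0.8])
mu_q, std_q = np.array([0.0, 0.3]), np.array([1.0, 1.2])
lhs = diag_gauss_cross_entropy(mu_p, std_p, mu_q, std_q) - diag_gauss_entropy(std_p)
assert np.isclose(lhs, diag_gauss_kl(mu_p, std_p, mu_q, std_q))  # H(p,q) - H(p) == KL(p||q)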
示例#23
0
def learn(
    base_env,
    policy_fn,
    *,
    max_fitness,  # has to be negative, as CMA-ES considers minimization
    popsize,
    gensize,
    bounds,
    sigma,
    eval_iters,
    timesteps_per_actorbatch,
    max_timesteps=0,
    max_episodes=0,
    max_iters=0,
    max_seconds=0,
    seed=0,
    optim_stepsize=3e-4,
    schedule='constant'  # annealing for stepsize parameters (epsilon and adam)
):
    set_global_seeds(seed)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = base_env.observation_space
    ac_space = base_env.action_space
    pi = policy_fn("pi", ob_space,
                   ac_space)  # Construct network for new policy
    backup_pi = policy_fn(
        "backup_pi", ob_space, ac_space
    )  # Construct a network for every individual to adapt during the es evolution

    sol_dim = int(
        np.sum([
            np.prod(v.get_shape().as_list())
            for v in pi.get_trainable_variables()
        ]))
    pop_size = tf.placeholder(dtype=tf.float32, shape=[])
    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule

    tfkids_fit = tf.placeholder(dtype=tf.float32, shape=[
        popsize,
    ])
    tfkids = tf.placeholder(dtype=tf.float32, shape=[popsize, sol_dim])

    tfmean = tf.Variable(initial_value=tf.random_normal([
        sol_dim,
    ], 0., 1.),
                         dtype=tf.float32)
    tfcov = tf.Variable(initial_value=tf.eye(sol_dim), dtype=tf.float32)
    mvn = MultivariateNormalFullCovariance(loc=tfmean, covariance_matrix=tfcov)

    loss = -tf.reduce_mean(mvn.log_prob(tfkids) * tfkids_fit)
    train_op = tf.train.GradientDescentOptimizer(lrmult).minimize(loss)

    optimize = U.function([tfkids, tfkids_fit, lrmult], [train_op])
    reproduce = U.function([pop_size], [mvn.sample(popsize)])
    get_mean = U.function([], [tfmean])

    input_mean = tf.placeholder(dtype=tf.float32, shape=[
        sol_dim,
    ])
    assign_weights_to_mean = U.function([input_mean],
                                        [tf.assign(tfmean, input_mean)])

    U.initialize()

    pi_set_from_flat_params = U.SetFromFlat(pi.get_trainable_variables())
    pi_get_flat_params = U.GetFlat(pi.get_trainable_variables())

    global timesteps_so_far, episodes_so_far, iters_so_far, \
        tstart, lenbuffer, rewbuffer, best_fitness, eval_seq
    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    assign_backup_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(backup_v, newv) for (
                backup_v,
                newv) in zipsame(backup_pi.get_variables(), pi.get_variables())
        ])
    assign_new_eq_backup = U.function(
        [], [],
        updates=[
            tf.assign(newv, backup_v)
            for (newv, backup_v
                 ) in zipsame(pi.get_variables(), backup_pi.get_variables())
        ])

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    # Build generator for all solutions
    actors = []
    best_fitness = -np.inf

    eval_seq = traj_segment_generator_eval(pi,
                                           base_env,
                                           timesteps_per_actorbatch,
                                           stochastic=True)
    for i in range(popsize):
        newActor = traj_segment_generator(pi,
                                          base_env,
                                          timesteps_per_actorbatch,
                                          stochastic=True,
                                          eval_iters=eval_iters)
        actors.append(newActor)
    while True:
        if max_timesteps and timesteps_so_far >= max_timesteps:
            logger.log("Max time steps")
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            logger.log("Max episodes")
            break
        elif max_iters and iters_so_far >= max_iters:
            logger.log("Max iterations")
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            logger.log("Max time")
            break
        assign_backup_eq_new()  # backup current policy

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(
                1.0 - float(timesteps_so_far) / (max_timesteps / 2), 0)
        else:
            raise NotImplementedError

        logger.log("********** Generation %i ************" % iters_so_far)
        eval_seg = eval_seq.__next__()
        rewbuffer.extend(eval_seg["ep_rets"])
        lenbuffer.extend(eval_seg["ep_lens"])
        if iters_so_far == 0:
            result_record()
            assign_weights_to_mean(pi_get_flat_params())
        # mean = pi_get_flat_params()
        solutions = reproduce(popsize)
        ob_segs = None
        segs = []
        costs = []
        lens = []
        for id, solution in enumerate(solutions[0]):
            # pi.set_Flat_variables(solution)
            pi_set_from_flat_params(solution)
            seg = actors[id].__next__()
            costs.append(-np.mean(seg["ep_rets"]))
            lens.append(np.sum(seg["ep_lens"]))
            segs.append(seg)
            if ob_segs is None:
                ob_segs = {'ob': np.copy(seg['ob'])}
            else:
                ob_segs['ob'] = np.append(ob_segs['ob'], seg['ob'], axis=0)
            assign_new_eq_backup()
        optimize(solutions[0], np.array(costs), cur_lrmult * optim_stepsize)
        # fit_idx = np.array(costs).flatten().argsort()[:len(costs)]
        # solutions = np.array(solutions)[fit_idx]
        # costs = np.array(costs)[fit_idx]
        # segs = np.array(segs)[fit_idx]
        # # Weights decay
        # # costs, real_costs = fitness_shift(costs)
        # # costs, real_costs = compute_centered_ranks(costs)
        # l2_decay = compute_weight_decay(0.01, solutions)
        # costs += l2_decay
        # costs, real_costs = fitness_normalization(costs)
        # # best_solution = np.copy(solutions[0])
        # # best_fitness = -real_costs[0]
        # # rewbuffer.extend(segs[0]["ep_rets"])
        # # lenbuffer.extend(segs[0]["ep_lens"])
        # es.tell_real_seg(solutions = solutions, function_values = costs, real_f = real_costs, segs = segs)
        # best_solution = np.copy(es.result[0])
        # best_fitness = -es.result[1]
        # rewbuffer.extend(es.result[3]["ep_rets"])
        # lenbuffer.extend(es.result[3]["ep_lens"])
        # logger.log("Generation:", es.countiter)
        # logger.log("Best Solution Fitness:", best_fitness)
        pi_set_from_flat_params(get_mean()[0])

        ob = ob_segs["ob"]
        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(
                ob)  # update running mean/std for observation normalization

        iters_so_far += 1
        episodes_so_far += sum(lens)
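
All of these examples move candidate parameter vectors in and out of the policy with U.GetFlat and U.SetFromFlat. Below is a framework-free sketch of the same flatten/unflatten bookkeeping over a list of NumPy arrays; it is illustrative only, not the baselines implementation.

import numpy as np

def get_flat(var_list):
    """Concatenate a list of arrays into a single 1-D parameter vector."""
    return np.concatenate([v.ravel() for v in var_list])

def set_from_flat(var_list, theta):
    """Write a flat vector back into the arrays, preserving each shape."""
    start = 0
    for v in var_list:
        size = v.size
        v[...] = theta[start:start + size].reshape(v.shape)
        start += size
    assert start == theta.size, "flat vector length must match total parameter count"

params = [np.zeros((2, 3)), np.zeros(4)]
theta = np.arange(10, dtype=np.float64)
set_from_flat(params, theta)
assert np.allclose(get_flat(params), theta)  # roundtrip recovers the flat vector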
示例#24
0
def learn(
        env,
        test_env,
        policy_fn,
        *,
        timesteps_per_actorbatch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        # CMA-ES
        max_fitness,  # has to be negative, as CMA-ES considers minimization
        popsize,
        gensize,
        bounds,
        sigma,
        eval_iters,
        max_v_train_iter,
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,
        # time constraint
        callback=None,
        # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',
        # annealing for stepsize parameters (epsilon and adam)
        seed,
        env_id):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space,
                   ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy
    backup_pi = policy_fn(
        "backup_pi", ob_space, ac_space
    )  # Construct a network for every individual to adapt during the es evolution
    pi_zero = policy_fn(
        "zero_pi", ob_space,
        ac_space)  # pi_0 will only be updated along with iterations

    reward = tf.placeholder(dtype=tf.float32, shape=[None])  # step rewards
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule

    ob = U.get_placeholder_cached(name="ob")
    next_ob = U.get_placeholder_cached(
        name="next_ob")  # next step observation for updating q function
    ac = U.get_placeholder_cached(
        name="act")  # action placeholder for computing q function

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    pi_adv = pi.qpred - pi.vpred
    adv_mean, adv_var = tf.nn.moments(pi_adv, axes=[0])
    normalized_pi_adv = (pi_adv - adv_mean) / tf.sqrt(adv_var)

    qf_loss = tf.reduce_mean(tf.square(reward + gamma * pi.vpred - pi.qpred))
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    qf_losses = [qf_loss]
    vf_losses = [vf_loss]
    pol_loss = -tf.reduce_mean(normalized_pi_adv)

    # Advantage function should be improved
    losses = [pol_loss, pol_entpen, meankl, meanent]
    loss_names = ["pol_surr_2", "pol_entpen", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    qf_var_list = [
        v for v in var_list if v.name.split("/")[1].startswith("qf")
    ]
    vf_var_list = [
        v for v in var_list if v.name.split("/")[1].startswith("vf")
    ]
    pol_var_list = [
        v for v in var_list if v.name.split("/")[1].startswith("pol")
    ]

    vf_lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                                vf_losses + [U.flatgrad(vf_loss, vf_var_list)])

    qf_lossandgrad = U.function([ob, ac, next_ob, lrmult, reward],
                                qf_losses + [U.flatgrad(qf_loss, qf_var_list)])

    qf_adam = MpiAdam(qf_var_list, epsilon=adam_epsilon)

    vf_adam = MpiAdam(vf_var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])

    assign_backup_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(backup_v, newv) for (
                backup_v,
                newv) in zipsame(backup_pi.get_variables(), pi.get_variables())
        ])
    assign_new_eq_backup = U.function(
        [], [],
        updates=[
            tf.assign(newv, backup_v)
            for (newv, backup_v
                 ) in zipsame(pi.get_variables(), backup_pi.get_variables())
        ])
    # Compute all losses

    mean_pi_actions = U.function(
        [ob], [pi.pd.mode()])  # later for computing pol_loss
    compute_pol_losses = U.function([ob, next_ob, ac], [pol_loss])

    U.initialize()

    get_pi_flat_params = U.GetFlat(pol_var_list)
    set_pi_flat_params = U.SetFromFlat(pol_var_list)

    vf_adam.sync()
    qf_adam.sync()

    global timesteps_so_far, episodes_so_far, iters_so_far, \
        tstart, lenbuffer, rewbuffer, tstart, ppo_timesteps_so_far, best_fitness

    episodes_so_far = 0
    timesteps_so_far = 0
    ppo_timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    best_fitness = np.inf

    eval_gen = traj_segment_generator_eval(pi,
                                           test_env,
                                           timesteps_per_actorbatch,
                                           stochastic=True)  # For evaluation
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_actorbatch,
                                     stochastic=True,
                                     eval_gen=eval_gen)  # For train V Func

    # Build generator for all solutions
    actors = []
    best_fitness = 0
    for i in range(popsize):
        newActor = traj_segment_generator(pi,
                                          env,
                                          timesteps_per_actorbatch,
                                          stochastic=True,
                                          eval_gen=eval_gen)
        actors.append(newActor)

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    while True:
        if max_timesteps and timesteps_so_far >= max_timesteps:
            print("Max time steps")
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            print("Max episodes")
            break
        elif max_iters and iters_so_far >= max_iters:
            print("Max iterations")
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            print("Max time")
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)

        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        # Generate new samples
        # Train V func
        for i in range(max_v_train_iter):
            logger.log("Iteration:" + str(iters_so_far) +
                       " - sub-train iter for V func:" + str(i))
            logger.log("Generate New Samples")
            seg = seg_gen.__next__()
            add_vtarg_and_adv(seg, gamma, lam)

            ob, ac, next_ob, atarg, reward, tdlamret, traj_idx = seg["ob"], seg["ac"], seg["next_ob"], seg["adv"], seg["rew"], seg["tdlamret"], \
                                                        seg["traj_index"]
            atarg = (atarg - atarg.mean()) / atarg.std(
            )  # standardized advantage function estimate
            d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                        shuffle=not pi.recurrent)
            optim_batchsize = optim_batchsize or ob.shape[0]

            if hasattr(pi, "ob_rms"):
                pi.ob_rms.update(
                    ob)  # update running mean/std for normalization

            assign_old_eq_new(
            )  # set old parameter values to new parameter values
            # Train V function
            logger.log("Training V Func and Evaluating V Func Losses")
            for _ in range(optim_epochs):
                losses = [
                ]  # list of tuples, each of which gives the loss for a minibatch
                for batch in d.iterate_once(optim_batchsize):
                    *vf_losses, g = vf_lossandgrad(batch["ob"], batch["ac"],
                                                   batch["atarg"],
                                                   batch["vtarg"], cur_lrmult)
                    vf_adam.update(g, optim_stepsize * cur_lrmult)
                    losses.append(vf_losses)
                logger.log(fmt_row(13, np.mean(losses, axis=0)))

            d_q = Dataset(dict(ob=ob,
                               ac=ac,
                               next_ob=next_ob,
                               reward=reward,
                               atarg=atarg,
                               vtarg=tdlamret),
                          shuffle=not pi.recurrent)

            # Re-train q function
            logger.log("Training Q Func Evaluating Q Func Losses")
            for _ in range(optim_epochs):
                losses = [
                ]  # list of tuples, each of which gives the loss for a minibatch
                for batch in d_q.iterate_once(optim_batchsize):
                    *qf_losses, g = qf_lossandgrad(batch["next_ob"],
                                                   batch["ac"], batch["ob"],
                                                   cur_lrmult, batch["reward"])
                    qf_adam.update(g, optim_stepsize * cur_lrmult)
                    losses.append(qf_losses)
                logger.log(fmt_row(13, np.mean(losses, axis=0)))

        # CMAES Train Policy
        assign_old_eq_new()  # set old parameter values to new parameter values
        assign_backup_eq_new()  # backup current policy
        flatten_weights = get_pi_flat_params()
        opt = cma.CMAOptions()
        opt['tolfun'] = max_fitness
        opt['popsize'] = popsize
        opt['maxiter'] = gensize
        opt['verb_disp'] = 0
        opt['verb_log'] = 0
        opt['seed'] = seed
        opt['AdaptSigma'] = True
        es = cma.CMAEvolutionStrategy(flatten_weights, sigma, opt)
        while True:
            if es.countiter >= gensize:
                logger.log("Max generations for current layer")
                break
            logger.log("Iteration:" + str(iters_so_far) +
                       " - sub-train Generation for Policy:" +
                       str(es.countiter))
            logger.log("Sigma=" + str(es.sigma))
            solutions = es.ask()
            costs = []
            lens = []

            assign_backup_eq_new()  # backup current policy

            for id, solution in enumerate(solutions):
                set_pi_flat_params(solution)
                losses = []
                cost = compute_pol_losses(ob, ob, mean_pi_actions(ob)[0])
                costs.append(cost[0])
                assign_new_eq_backup()
            # Weights decay
            l2_decay = compute_weight_decay(0.99, solutions)
            costs += l2_decay
            # costs, real_costs = fitness_normalization(costs)
            costs, real_costs = fitness_rank(costs)
            es.tell_real_seg(solutions=solutions,
                             function_values=costs,
                             real_f=real_costs,
                             segs=None)
            best_solution = es.result[0]
            best_fitness = es.result[1]
            logger.log("Best Solution Fitness:" + str(best_fitness))
            set_pi_flat_params(best_solution)

        iters_so_far += 1
        episodes_so_far += sum(lens)
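
The compute_weight_decay(0.99, solutions) call above adds an L2 penalty to each candidate's cost before the rank transform. A common form of this helper in evolution-strategy code is sketched below; this is an assumption about its behavior, not the exact function used here.

import numpy as np

def compute_weight_decay(weight_decay_coef, solutions):
    """Per-candidate L2 penalty: coefficient times the mean squared parameter."""
    solutions = np.asarray(solutions)                    # shape (popsize, sol_dim)
    return weight_decay_coef * np.mean(solutions ** 2, axis=1)

population = np.random.randn(8, 100)                     # 8 candidate parameter vectors
costs = -np.random.rand(8)                               # e.g. negated returns (lower is better)
costs = costs + compute_weight_decay(0.99, population)   # penalize large-norm candidates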
示例#25
0
def learn(
        make_env,
        make_policy,
        *,
        n_episodes,
        horizon,
        delta,
        gamma,
        max_iters,
        sampler=None,
        use_natural_gradient=False,  #can be 'exact', 'approximate'
        fisher_reg=1e-2,
        iw_method='is',
        iw_norm='none',
        bound='J',
        line_search_type='parabola',
        save_weights=False,
        improvement_tol=0.,
        center_return=False,
        render_after=None,
        max_offline_iters=100,
        callback=None,
        clipping=False,
        entropy='none',
        positive_return=False,
        reward_clustering='none'):

    np.set_printoptions(precision=3)
    max_samples = horizon * n_episodes

    if line_search_type == 'binary':
        line_search = line_search_binary
    elif line_search_type == 'parabola':
        line_search = line_search_parabola
    else:
        raise ValueError()

    # Building the environment
    env = make_env()
    ob_space = env.observation_space
    ac_space = env.action_space

    # Building the policy
    pi = make_policy('pi', ob_space, ac_space)
    oldpi = make_policy('oldpi', ob_space, ac_space)

    all_var_list = pi.get_trainable_variables()
    var_list = [
        v for v in all_var_list if v.name.split('/')[1].startswith('pol')
    ]

    shapes = [U.intprod(var.get_shape().as_list()) for var in var_list]
    n_parameters = sum(shapes)

    # Placeholders
    ob_ = ob = U.get_placeholder_cached(name='ob')
    ac_ = pi.pdtype.sample_placeholder([max_samples], name='ac')
    mask_ = tf.placeholder(dtype=tf.float32, shape=(max_samples), name='mask')
    rew_ = tf.placeholder(dtype=tf.float32, shape=(max_samples), name='rew')
    disc_rew_ = tf.placeholder(dtype=tf.float32,
                               shape=(max_samples),
                               name='disc_rew')
    gradient_ = tf.placeholder(dtype=tf.float32,
                               shape=(n_parameters, 1),
                               name='gradient')
    iter_number_ = tf.placeholder(dtype=tf.int32, name='iter_number')
    losses_with_name = []

    # Policy densities
    target_log_pdf = pi.pd.logp(ac_)
    behavioral_log_pdf = oldpi.pd.logp(ac_)
    log_ratio = target_log_pdf - behavioral_log_pdf

    # Split operations
    disc_rew_split = tf.stack(tf.split(disc_rew_ * mask_, n_episodes))
    rew_split = tf.stack(tf.split(rew_ * mask_, n_episodes))
    log_ratio_split = tf.stack(tf.split(log_ratio * mask_, n_episodes))
    target_log_pdf_split = tf.stack(
        tf.split(target_log_pdf * mask_, n_episodes))
    behavioral_log_pdf_split = tf.stack(
        tf.split(behavioral_log_pdf * mask_, n_episodes))
    mask_split = tf.stack(tf.split(mask_, n_episodes))

    # Renyi divergence
    emp_d2_split = tf.stack(
        tf.split(pi.pd.renyi(oldpi.pd, 2) * mask_, n_episodes))
    emp_d2_cum_split = tf.reduce_sum(emp_d2_split, axis=1)
    empirical_d2 = tf.reduce_mean(tf.exp(emp_d2_cum_split))

    # Return
    ep_return = tf.reduce_sum(mask_split * disc_rew_split, axis=1)
    if clipping:
        rew_split = tf.clip_by_value(rew_split, -1, 1)

    if center_return:
        ep_return = ep_return - tf.reduce_mean(ep_return)
        rew_split = rew_split - (tf.reduce_sum(rew_split) /
                                 (tf.reduce_sum(mask_split) + 1e-24))

    discounter = [pow(gamma, i) for i in range(0, horizon)]  # Decreasing gamma
    discounter_tf = tf.constant(discounter)
    disc_rew_split = rew_split * discounter_tf

    return_mean = tf.reduce_mean(ep_return)
    return_std = U.reduce_std(ep_return)
    return_max = tf.reduce_max(ep_return)
    return_min = tf.reduce_min(ep_return)
    return_abs_max = tf.reduce_max(tf.abs(ep_return))
    return_step_max = tf.reduce_max(tf.abs(rew_split))  # Max step reward
    return_step_mean = tf.abs(tf.reduce_mean(rew_split))
    positive_step_return_max = tf.maximum(0.0, tf.reduce_max(rew_split))
    negative_step_return_max = tf.maximum(0.0, tf.reduce_max(-rew_split))
    return_step_maxmin = tf.abs(positive_step_return_max -
                                negative_step_return_max)

    losses_with_name.extend([(return_mean, 'InitialReturnMean'),
                             (return_max, 'InitialReturnMax'),
                             (return_min, 'InitialReturnMin'),
                             (return_std, 'InitialReturnStd'),
                             (empirical_d2, 'EmpiricalD2'),
                             (return_step_max, 'ReturnStepMax'),
                             (return_step_maxmin, 'ReturnStepMaxmin')])

    if iw_method == 'pdis':
        # log_ratio_split cumulative sum
        log_ratio_cumsum = tf.cumsum(log_ratio_split, axis=1)
        # Exponentiate
        ratio_cumsum = tf.exp(log_ratio_cumsum)
        # Multiply by the step-wise reward (not episode)
        ratio_reward = ratio_cumsum * disc_rew_split
        # Average on episodes
        ratio_reward_per_episode = tf.reduce_sum(ratio_reward, axis=1)
        w_return_mean = tf.reduce_sum(ratio_reward_per_episode,
                                      axis=0) / n_episodes
        # Get d2(w0:t) with mask
        d2_w_0t = tf.exp(tf.cumsum(emp_d2_split,
                                   axis=1)) * mask_split  # LEAVE THIS OUTSIDE
        # Sum d2(w0:t) over timesteps
        episode_d2_0t = tf.reduce_sum(d2_w_0t, axis=1)
        # Sample variance
        J_sample_variance = (1 / (n_episodes - 1)) * tf.reduce_sum(
            tf.square(ratio_reward_per_episode - w_return_mean))
        losses_with_name.append((J_sample_variance, 'J_sample_variance'))
        losses_with_name.extend([(tf.reduce_max(ratio_cumsum), 'MaxIW'),
                                 (tf.reduce_min(ratio_cumsum), 'MinIW'),
                                 (tf.reduce_mean(ratio_cumsum), 'MeanIW'),
                                 (U.reduce_std(ratio_cumsum), 'StdIW')])
        losses_with_name.extend([(tf.reduce_max(d2_w_0t), 'MaxD2w0t'),
                                 (tf.reduce_min(d2_w_0t), 'MinD2w0t'),
                                 (tf.reduce_mean(d2_w_0t), 'MeanD2w0t'),
                                 (U.reduce_std(d2_w_0t), 'StdD2w0t')])

    elif iw_method == 'is':
        iw = tf.exp(tf.reduce_sum(log_ratio_split, axis=1))
        if iw_norm == 'none':
            iwn = iw / n_episodes
            w_return_mean = tf.reduce_sum(iwn * ep_return)
            J_sample_variance = (1 / (n_episodes - 1)) * tf.reduce_sum(
                tf.square(iw * ep_return - w_return_mean))
            losses_with_name.append((J_sample_variance, 'J_sample_variance'))
        elif iw_norm == 'sn':
            iwn = iw / tf.reduce_sum(iw)
            w_return_mean = tf.reduce_sum(iwn * ep_return)
        elif iw_norm == 'regression':
            iwn = iw / n_episodes
            mean_iw = tf.reduce_mean(iw)
            beta = tf.reduce_sum(
                (iw - mean_iw) * ep_return * iw) / (tf.reduce_sum(
                    (iw - mean_iw)**2) + 1e-24)
            w_return_mean = tf.reduce_mean(iw * ep_return - beta * (iw - 1))
        else:
            raise NotImplementedError()
        ess_classic = tf.linalg.norm(iw, 1)**2 / tf.linalg.norm(iw, 2)**2
        sqrt_ess_classic = tf.linalg.norm(iw, 1) / tf.linalg.norm(iw, 2)
        ess_renyi = n_episodes / empirical_d2
        losses_with_name.extend([(tf.reduce_max(iwn), 'MaxIWNorm'),
                                 (tf.reduce_min(iwn), 'MinIWNorm'),
                                 (tf.reduce_mean(iwn), 'MeanIWNorm'),
                                 (U.reduce_std(iwn), 'StdIWNorm'),
                                 (tf.reduce_max(iw), 'MaxIW'),
                                 (tf.reduce_min(iw), 'MinIW'),
                                 (tf.reduce_mean(iw), 'MeanIW'),
                                 (U.reduce_std(iw), 'StdIW'),
                                 (ess_classic, 'ESSClassic'),
                                 (ess_renyi, 'ESSRenyi')])
    elif iw_method == 'rbis':
        # Check if we need to cluster rewards
        rew_clustering_options = reward_clustering.split(':')
        if reward_clustering == 'none':
            pass  # Do nothing
        elif rew_clustering_options[0] == 'global':
            assert len(
                rew_clustering_options
            ) == 2, "Reward clustering: Provide the correct number of parameters"
            N = int(rew_clustering_options[1])
            tf.add_to_collection(
                'prints',
                tf.Print(ep_return, [ep_return], 'ep_return', summarize=20))
            global_rew_min = tf.Variable(float('+inf'), trainable=False)
            global_rew_max = tf.Variable(float('-inf'), trainable=False)
            rew_min = tf.reduce_min(ep_return)
            rew_max = tf.reduce_max(ep_return)
            global_rew_min = tf.assign(global_rew_min,
                                       tf.minimum(global_rew_min, rew_min))
            global_rew_max = tf.assign(global_rew_max,
                                       tf.maximum(global_rew_max, rew_max))
            interval_size = (global_rew_max - global_rew_min) / N
            ep_return = tf.floordiv(ep_return, interval_size) * interval_size
        elif rew_clustering_options[0] == 'batch':
            assert len(
                rew_clustering_options
            ) == 2, "Reward clustering: Provide the correct number of parameters"
            N = int(rew_clustering_options[1])
            rew_min = tf.reduce_min(ep_return)
            rew_max = tf.reduce_max(ep_return)
            interval_size = (rew_max - rew_min) / N
            ep_return = tf.floordiv(ep_return, interval_size) * interval_size
        elif rew_clustering_options[0] == 'manual':
            assert len(
                rew_clustering_options
            ) == 4, "Reward clustering: Provide the correct number of parameters"
            N, rew_min, rew_max = map(int, rew_clustering_options[1:])
            interval_size = (rew_max - rew_min) / N
            # Clip to avoid overflow and cluster
            ep_return = tf.clip_by_value(ep_return, rew_min, rew_max)
            ep_return = tf.floordiv(ep_return, interval_size) * interval_size
        else:
            raise Exception('Unrecognized reward clustering scheme.')

        # Get pdfs for episodes
        target_log_pdf_episode = tf.reduce_sum(target_log_pdf_split, axis=1)
        behavioral_log_pdf_episode = tf.reduce_sum(behavioral_log_pdf_split,
                                                   axis=1)
        # Normalize log-probabilities (avoid overflows as much as possible)
        normalization_factor = tf.reduce_mean(
            tf.stack([target_log_pdf_episode, behavioral_log_pdf_episode]))
        target_norm_log_pdf_episode = target_log_pdf_episode - normalization_factor
        behavioral_norm_log_pdf_episode = behavioral_log_pdf_episode - normalization_factor
        # Exponentiate
        target_pdf_episode = tf.clip_by_value(
            tf.cast(tf.exp(target_norm_log_pdf_episode), tf.float64), 1e-300,
            1e+300)
        behavioral_pdf_episode = tf.clip_by_value(
            tf.cast(tf.exp(behavioral_norm_log_pdf_episode), tf.float64),
            1e-300, 1e+300)
        tf.add_to_collection(
            'asserts',
            tf.assert_positive(target_pdf_episode, name='target_pdf_positive'))
        tf.add_to_collection(
            'asserts',
            tf.assert_positive(behavioral_pdf_episode,
                               name='behavioral_pdf_positive'))
        # Compute the merging matrix (reward-clustering) and the number of clusters
        reward_unique, reward_indexes = tf.unique(ep_return)
        episode_clustering_matrix = tf.cast(
            tf.one_hot(reward_indexes, n_episodes), tf.float64)
        max_index = tf.reduce_max(reward_indexes) + 1
        trajectories_per_cluster = tf.reduce_sum(episode_clustering_matrix,
                                                 axis=0)[:max_index]
        tf.add_to_collection(
            'asserts',
            tf.assert_positive(tf.reduce_sum(episode_clustering_matrix,
                                             axis=0)[:max_index],
                               name='clustering_matrix'))
        # Get the clustered pdfs
        clustered_target_pdf = tf.matmul(
            tf.reshape(target_pdf_episode, (1, -1)),
            episode_clustering_matrix)[0][:max_index]
        clustered_behavioral_pdf = tf.matmul(
            tf.reshape(behavioral_pdf_episode, (1, -1)),
            episode_clustering_matrix)[0][:max_index]
        tf.add_to_collection(
            'asserts',
            tf.assert_positive(clustered_target_pdf,
                               name='clust_target_pdf_positive'))
        tf.add_to_collection(
            'asserts',
            tf.assert_positive(clustered_behavioral_pdf,
                               name='clust_behavioral_pdf_positive'))
        # Compute the J
        ratio_clustered = clustered_target_pdf / clustered_behavioral_pdf
        #ratio_reward = tf.cast(ratio_clustered, tf.float32) * reward_unique                                                  # ---- No cluster cardinality
        ratio_reward = tf.cast(ratio_clustered,
                               tf.float32) * reward_unique * tf.cast(
                                   trajectories_per_cluster,
                                   tf.float32)  # ---- Cluster cardinality
        #w_return_mean = tf.reduce_sum(ratio_reward) / tf.cast(max_index, tf.float32)                                         # ---- No cluster cardinality
        w_return_mean = tf.reduce_sum(ratio_reward) / tf.cast(
            n_episodes, tf.float32)  # ---- Cluster cardinality
        # Divergences
        ess_classic = tf.linalg.norm(ratio_reward, 1)**2 / tf.linalg.norm(
            ratio_reward, 2)**2
        sqrt_ess_classic = tf.linalg.norm(ratio_reward, 1) / tf.linalg.norm(
            ratio_reward, 2)
        ess_renyi = n_episodes / empirical_d2
        # Summaries
        losses_with_name.extend([(tf.reduce_max(ratio_clustered), 'MaxIW'),
                                 (tf.reduce_min(ratio_clustered), 'MinIW'),
                                 (tf.reduce_mean(ratio_clustered), 'MeanIW'),
                                 (U.reduce_std(ratio_clustered), 'StdIW'),
                                 (1 - (max_index / n_episodes),
                                  'RewardCompression'),
                                 (ess_classic, 'ESSClassic'),
                                 (ess_renyi, 'ESSRenyi')])
    else:
        raise NotImplementedError()

    if bound == 'J':
        bound_ = w_return_mean
    elif bound == 'std-d2':
        bound_ = w_return_mean - tf.sqrt(
            (1 - delta) / (delta * ess_renyi)) * return_std
    elif bound == 'max-d2':
        var_estimate = tf.sqrt(
            (1 - delta) / (delta * ess_renyi)) * return_abs_max
        bound_ = w_return_mean - tf.sqrt(
            (1 - delta) / (delta * ess_renyi)) * return_abs_max
    elif bound == 'max-ess':
        bound_ = w_return_mean - tf.sqrt(
            (1 - delta) / delta) / sqrt_ess_classic * return_abs_max
    elif bound == 'std-ess':
        bound_ = w_return_mean - tf.sqrt(
            (1 - delta) / delta) / sqrt_ess_classic * return_std
    elif bound == 'pdis-max-d2':
        # Discount factor
        if gamma >= 1:
            discounter = [
                float(1 + 2 * (horizon - t - 1)) for t in range(0, horizon)
            ]
        else:

            def f(t):
                return pow(gamma, 2 * t) + (
                    2 * pow(gamma, t) *
                    (pow(gamma, t + 1) - pow(gamma, horizon))) / (1 - gamma)

            discounter = [f(t) for t in range(0, horizon)]
        discounter_tf = tf.constant(discounter)
        mean_episode_d2 = tf.reduce_sum(
            d2_w_0t, axis=0) / (tf.reduce_sum(mask_split, axis=0) + 1e-24)
        discounted_d2 = mean_episode_d2 * discounter_tf  # Discounted d2
        discounted_total_d2 = tf.reduce_sum(discounted_d2,
                                            axis=0)  # Sum over time
        bound_ = w_return_mean - tf.sqrt(
            (1 - delta) * discounted_total_d2 /
            (delta * n_episodes)) * return_step_max
    elif bound == 'pdis-mean-d2':
        # Discount factor
        if gamma >= 1:
            discounter = [
                float(1 + 2 * (horizon - t - 1)) for t in range(0, horizon)
            ]
        else:

            def f(t):
                return pow(gamma, 2 * t) + (
                    2 * pow(gamma, t) *
                    (pow(gamma, t + 1) - pow(gamma, horizon))) / (1 - gamma)

            discounter = [f(t) for t in range(0, horizon)]
        discounter_tf = tf.constant(discounter)
        mean_episode_d2 = tf.reduce_sum(
            d2_w_0t, axis=0) / (tf.reduce_sum(mask_split, axis=0) + 1e-24)
        discounted_d2 = mean_episode_d2 * discounter_tf  # Discounted d2
        discounted_total_d2 = tf.reduce_sum(discounted_d2,
                                            axis=0)  # Sum over time
        bound_ = w_return_mean - tf.sqrt(
            (1 - delta) * discounted_total_d2 /
            (delta * n_episodes)) * return_step_mean
    else:
        raise NotImplementedError()

    # Policy entropy for exploration
    ent = pi.pd.entropy()
    meanent = tf.reduce_mean(ent)
    losses_with_name.append((meanent, 'MeanEntropy'))
    # Add policy entropy bonus
    if entropy != 'none':
        scheme, v1, v2 = entropy.split(':')
        if scheme == 'step':
            entcoeff = tf.cond(iter_number_ < int(v2), lambda: float(v1),
                               lambda: float(0.0))
            losses_with_name.append((entcoeff, 'EntropyCoefficient'))
            entbonus = entcoeff * meanent
            bound_ = bound_ + entbonus
        elif scheme == 'lin':
            ip = tf.cast(iter_number_ / max_iters, tf.float32)
            entcoeff_decay = tf.maximum(
                0.0,
                float(v2) + (float(v1) - float(v2)) * (1.0 - ip))
            losses_with_name.append((entcoeff_decay, 'EntropyCoefficient'))
            entbonus = entcoeff_decay * meanent
            bound_ = bound_ + entbonus
        elif scheme == 'exp':
            ent_f = tf.exp(
                -tf.abs(tf.reduce_mean(iw) - 1) * float(v2)) * float(v1)
            losses_with_name.append((ent_f, 'EntropyCoefficient'))
            bound_ = bound_ + ent_f * meanent
        else:
            raise Exception('Unrecognized entropy scheme.')

    losses_with_name.append((w_return_mean, 'ReturnMeanIW'))
    losses_with_name.append((bound_, 'Bound'))
    losses, loss_names = map(list, zip(*losses_with_name))

    if use_natural_gradient:
        p = tf.placeholder(dtype=tf.float32, shape=[None])
        target_logpdf_episode = tf.reduce_sum(target_log_pdf_split *
                                              mask_split,
                                              axis=1)
        grad_logprob = U.flatgrad(
            tf.stop_gradient(iwn) * target_logpdf_episode, var_list)
        dot_product = tf.reduce_sum(grad_logprob * p)
        hess_logprob = U.flatgrad(dot_product, var_list)
        compute_linear_operator = U.function([p, ob_, ac_, disc_rew_, mask_],
                                             [-hess_logprob])

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])

    assert_ops = tf.group(*tf.get_collection('asserts'))
    print_ops = tf.group(*tf.get_collection('prints'))

    compute_lossandgrad = U.function(
        [ob_, ac_, rew_, disc_rew_, mask_, iter_number_],
        losses + [U.flatgrad(bound_, var_list), assert_ops, print_ops])
    compute_grad = U.function(
        [ob_, ac_, rew_, disc_rew_, mask_, iter_number_],
        [U.flatgrad(bound_, var_list), assert_ops, print_ops])
    compute_bound = U.function(
        [ob_, ac_, rew_, disc_rew_, mask_, iter_number_],
        [bound_, assert_ops, print_ops])
    compute_losses = U.function(
        [ob_, ac_, rew_, disc_rew_, mask_, iter_number_], losses)
    #compute_temp = U.function([ob_, ac_, rew_, disc_rew_, mask_], [ratio_cumsum, discounted_ratio])

    set_parameter = U.SetFromFlat(var_list)
    get_parameter = U.GetFlat(var_list)

    if sampler is None:
        seg_gen = traj_segment_generator(pi,
                                         env,
                                         n_episodes,
                                         horizon,
                                         stochastic=True)
        sampler = type("SequentialSampler", (object, ), {
            "collect": lambda self, _: seg_gen.__next__()
        })()

    U.initialize()

    # Starting optimizing

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=n_episodes)
    rewbuffer = deque(maxlen=n_episodes)

    while True:

        iters_so_far += 1

        if render_after is not None and iters_so_far % render_after == 0:
            if hasattr(env, 'render'):
                render(env, pi, horizon)

        if callback:
            callback(locals(), globals())

        if iters_so_far >= max_iters:
            print('Finished...')
            break

        logger.log('********** Iteration %i ************' % iters_so_far)

        theta = get_parameter()

        with timed('sampling'):
            seg = sampler.collect(theta)

        add_disc_rew(seg, gamma)

        lens, rets = seg['ep_lens'], seg['ep_rets']
        lenbuffer.extend(lens)
        rewbuffer.extend(rets)
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)

        args = ob, ac, rew, disc_rew, mask, iter_number = seg['ob'], seg[
            'ac'], seg['rew'], seg['disc_rew'], seg['mask'], iters_so_far

        assign_old_eq_new()

        def evaluate_loss():
            loss = compute_bound(*args)
            return loss[0]

        def evaluate_gradient():
            gradient = compute_grad(*args)
            return gradient[0]

        if use_natural_gradient:

            def evaluate_fisher_vector_prod(x):
                return compute_linear_operator(x, *args)[0] + fisher_reg * x

            def evaluate_natural_gradient(g):
                return cg(evaluate_fisher_vector_prod,
                          g,
                          cg_iters=10,
                          verbose=0)
        else:
            evaluate_natural_gradient = None

        with timed('summaries before'):
            logger.record_tabular("Iteration", iters_so_far)
            logger.record_tabular("InitialBound", evaluate_loss())
            logger.record_tabular("EpLenMean", np.mean(lenbuffer))
            logger.record_tabular("EpRewMean", np.mean(rewbuffer))
            logger.record_tabular("EpThisIter", len(lens))
            logger.record_tabular("EpisodesSoFar", episodes_so_far)
            logger.record_tabular("TimestepsSoFar", timesteps_so_far)
            logger.record_tabular("TimeElapsed", time.time() - tstart)

        if save_weights:
            logger.record_tabular('Weights', str(get_parameter()))
            import pickle
            with open('checkpoint.pkl', 'wb') as f:
                pickle.dump(theta, f)

        with timed("offline optimization"):
            theta, improvement = optimize_offline(
                theta,
                set_parameter,
                line_search,
                evaluate_loss,
                evaluate_gradient,
                evaluate_natural_gradient,
                max_offline_ite=max_offline_iters)

        set_parameter(theta)

        with timed('summaries after'):
            meanlosses = np.array(compute_losses(*args))
            for (lossname, lossval) in zip(loss_names, meanlosses):
                logger.record_tabular(lossname, lossval)

        logger.dump_tabular()

    env.close()
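
In the 'is' branch above, each episode's return is weighted by the exponentiated sum of per-step log-ratios, and the classic effective sample size ESS = ||w||_1^2 / ||w||_2^2 is tracked as a diagnostic. Below is a NumPy sketch of those two quantities for a padded batch of episodes; the array names are illustrative.

import numpy as np

def importance_weights(target_logp, behav_logp, mask):
    """Per-episode IS weights from per-step log-probs of shape (n_episodes, horizon)."""
    log_ratio = (target_logp - behav_logp) * mask  # zero out padded timesteps
    return np.exp(np.sum(log_ratio, axis=1))

def ess_classic(w):
    """Classic effective sample size: ||w||_1^2 / ||w||_2^2."""
    return np.sum(np.abs(w)) ** 2 / np.sum(w ** 2)

n_episodes, horizon = 10, 50
target_logp = -0.5 + 0.01 * np.random.randn(n_episodes, horizon)
behav_logp = target_logp + 0.01 * np.random.randn(n_episodes, horizon)
mask = np.ones((n_episodes, horizon))
ep_return = np.random.randn(n_episodes)

w = importance_weights(target_logp, behav_logp, mask)
w_return_mean = np.sum((w / n_episodes) * ep_return)  # iw_norm == 'none' estimator
print("ESS:", ess_classic(w))                         # near n_episodes when weights are close to 1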
示例#26
0
def learn(
        env,
        policy_fn,
        *,
        timesteps_per_batch,  # what to train on
        max_kl,
        cg_iters,
        gamma,
        lam,  # advantage estimation
        entc=0.5,
        cg_damping=1e-2,
        vf_stepsize=3e-4,
        vf_iters=3,
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,  # time constraint
        callback=None,
        i_trial):
    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space, ac_space)
    oldpi = policy_fn("oldpi", ob_space, ac_space)
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])
    entp = tf.placeholder(dtype=tf.float32, shape=[])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)

    entbonus = entp * meanent

    vferr = tf.reduce_mean(tf.square(pi.vpred - ret))

    ratio = tf.exp(pi.pd.logp(ac) -
                   oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = tf.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "loss_ent"]

    dist = meankl

    all_var_list = pi.get_trainable_variables()
    var_list = [
        v for v in all_var_list if v.name.split("/")[1].startswith("pol")
    ]
    vf_var_list = [
        v for v in all_var_list if v.name.split("/")[1].startswith("vf")
    ]
    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32,
                                  shape=[None],
                                  name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
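    # gvp = dot(grad_theta KL, tangent); its gradient (fvp) is the KL Hessian times
    # the tangent, i.e. a Fisher-vector product, which is all conjugate gradient needs.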
    gvp = tf.add_n([
        tf.reduce_sum(g * tangent)
        for (g, tangent) in zipsame(klgrads, tangents)
    ])  #pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, entp], losses)
    compute_lossandgrad = U.function([ob, ac, atarg, entp], losses +
                                     [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret],
                                       U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    U.initialize()
    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True,
                                     gamma=gamma)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards
    drwdsbuffer = deque(maxlen=40)

    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        logger.log("********** Iteration %i ************" % iters_so_far)

        with timed("sampling"):
            seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # entcoeff = max(entc - float(iters_so_far) / float(max_iters), 0.01)
        entcoeff = 0.0

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before udpate
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate

        if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret)
        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        args = seg["ob"], seg["ac"], atarg
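        # Subsample every 5th transition to cheapen the damped Fisher-vector
        # product (F p + cg_damping * p) computed by fisher_vector_product below.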
        fvpargs = [arr[::5] for arr in args]

        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        assign_old_eq_new()  # set old parameter values to new parameter values
        with timed("computegrad"):
            *lossbefore, g = compute_lossandgrad(*args, entcoeff)
        lossbefore = allmean(np.array(lossbefore))
        g = allmean(g)
        if np.allclose(g, 0):
            print("Got zero gradient. not updating")
        else:
            with timed("cg"):
                stepdir = cg(fisher_vector_product,
                             g,
                             cg_iters=cg_iters,
                             verbose=rank == 0)
            assert np.isfinite(stepdir).all()
            shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
            lm = np.sqrt(shs / max_kl)
            # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
            fullstep = stepdir / lm
            expectedimprove = g.dot(fullstep)
            surrbefore = lossbefore[0]
            stepsize = 1.0
            thbefore = get_flat()
            for _ in range(10):
                thnew = thbefore + fullstep * stepsize
                set_from_flat(thnew)
                meanlosses = surr, kl, *_ = allmean(
                    np.array(compute_losses(*args, entcoeff)))
                improve = surr - surrbefore
                print("Expected: %.3f Actual: %.3f" %
                      (expectedimprove, improve))
                if not np.isfinite(meanlosses).all():
                    print("Got non-finite value of losses -- bad!")
                elif kl > max_kl * 1.5:
                    print("violated KL constraint. shrinking step.")
                elif improve < 0:
                    print("surrogate didn't improve. shrinking step.")
                else:
                    print("Stepsize OK!")
                    break
                stepsize *= .5
            else:
                print("couldn't compute a good step")
                set_from_flat(thbefore)
            if nworkers > 1 and iters_so_far % 20 == 0:
                paramsums = MPI.COMM_WORLD.allgather(
                    (thnew.sum(), vfadam.getflat().sum()))  # list of tuples
                assert all(
                    np.allclose(ps, paramsums[0]) for ps in paramsums[1:])

        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.logkv(lossname, lossval)

        with timed("vf"):
            for _ in range(vf_iters):
                for (mbob, mbret) in dataset.iterbatches(
                    (seg["ob"], seg["tdlamret"]),
                        include_final_partial_batch=False,
                        batch_size=64):
                    g = allmean(compute_vflossandgrad(mbob, mbret))
                    vfadam.update(g, vf_stepsize)

        logger.logkv("ev_tdlam_before",
                     explained_variance(vpredbefore, tdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_drwds"]
                   )  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews, drwds = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        drwdsbuffer.extend(drwds)

        logger.logkv("EpLenMean", np.mean(lenbuffer))
        logger.logkv("EpRewMean", np.mean(rewbuffer))
        logger.logkv("EpThisIter", len(lens))
        logger.logkv("EpDRewMean", np.mean(drwdsbuffer))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.logkv("EpisodesSoFar", episodes_so_far)
        logger.logkv("TimestepsSoFar", timesteps_so_far)
        logger.logkv("TimeElapsed", time.time() - tstart)
        logger.logkv('trial', i_trial)
        logger.logkv("Iteration", iters_so_far)
        logger.logkv("Name", 'TRPO')

        if rank == 0:
            logger.dump_tabular()
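
The update loop above scales the CG direction so the quadratic KL estimate hits max_kl (lm = sqrt(shs / max_kl)) and then backtracks whenever the losses are non-finite, the KL bound is violated, or the surrogate does not improve. Below is a self-contained sketch of that scaling and backtracking on a toy quadratic surrogate; the helper name and the toy problem are made up for illustration.

import numpy as np

def kl_constrained_step(stepdir, fvp, grad, surrogate, kl, theta0,
                        max_kl=0.01, backtracks=10):
    """Scale the CG direction to the trust region, then backtrack on the true losses."""
    shs = 0.5 * stepdir.dot(fvp(stepdir))       # quadratic KL estimate 1/2 s^T F s
    lm = np.sqrt(shs / max_kl)                  # the "lagrange multiplier" in the logs above
    fullstep = stepdir / lm                     # estimated KL of this step equals max_kl
    expectedimprove = grad.dot(fullstep)
    surrbefore = surrogate(theta0)
    stepsize = 1.0
    for _ in range(backtracks):
        thnew = theta0 + stepsize * fullstep
        improve = surrogate(thnew) - surrbefore
        print("Expected: %.3f Actual: %.3f" % (stepsize * expectedimprove, improve))
        if np.isfinite(improve) and kl(theta0, thnew) <= 1.5 * max_kl and improve > 0:
            return thnew                        # "Stepsize OK!"
        stepsize *= 0.5                         # shrink the step and retry
    return theta0                               # no acceptable step: keep old parameters

# Toy problem: maximize -(theta - target)^2 with an identity "Fisher" matrix.
target = np.array([0.3, -0.2])
surrogate = lambda th: -np.sum((th - target) ** 2)
kl_toy = lambda old, new: 0.5 * np.sum((new - old) ** 2)
theta0 = np.zeros(2)
grad = -2.0 * (theta0 - target)                 # surrogate gradient at theta0
theta1 = kl_constrained_step(grad, lambda v: v, grad, surrogate, kl_toy, theta0)
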
Example #27
def learn(
        env,
        policy_func,
        reward_giver,
        expert_dataset,
        rank,
        pretrained,
        pretrained_weight,
        *,
        #                   0
        g_step,
        d_step,
        entcoeff,
        save_per_iter,
        #                         1024
        ckpt_dir,
        log_dir,
        timesteps_per_batch,
        task_name,
        robot_name,
        gamma,
        lam,
        max_kl,
        cg_iters,
        cg_damping=1e-2,
        vf_stepsize=3e-4,
        d_stepsize=3e-4,
        vf_iters=3,
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        callback=None):

    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi",
                     ob_space,
                     ac_space,
                     reuse=(pretrained_weight is not None))
    oldpi = policy_func("oldpi", ob_space, ac_space)
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    entbonus = entcoeff * meanent

    vferr = tf.reduce_mean(tf.square(pi.vpred - ret))

    ratio = tf.exp(pi.pd.logp(ac) -
                   oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = tf.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = pi.get_trainable_variables()
    var_list = [
        v for v in all_var_list
        if v.name.startswith("pi/pol") or v.name.startswith("pi/logstd")
    ]
    vf_var_list = [v for v in all_var_list if v.name.startswith("pi/vff")]
    assert len(var_list) == len(vf_var_list) + 1
    d_adam = MpiAdam(reward_giver.get_trainable_variables())
    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32,
                                  shape=[None],
                                  name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n([
        tf.reduce_sum(g * tangent)
        for (g, tangent) in zipsame(klgrads, tangents)
    ])  # pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses +
                                     [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret],
                                       U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    U.initialize()
    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    d_adam.sync()
    vfadam.sync()
    if rank == 0:
        print("Init param sum", th_init.sum(), flush=True)
    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     reward_giver,
                                     timesteps_per_batch,
                                     stochastic=True)
    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards
    true_rewbuffer = deque(maxlen=40)
    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1

    # if a pretrained weight is provided
    if pretrained_weight is not None:
        U.load_state(pretrained_weight, var_list=pi.get_variables())

    if robot_name == 'scara':
        summary_writer = tf.summary.FileWriter(
            '/home/yue/gym-gazebo/Tensorboard/scara',
            graph=tf.get_default_graph())
    elif robot_name == 'mara':
        # summary_writer=tf.summary.FileWriter('/home/yue/gym-gazebo/Tensorboard/mara/down-home_position',graph=tf.get_default_graph())
        summary_writer = tf.summary.FileWriter(
            '/home/yue/gym-gazebo/Tensorboard/mara/collisions_model/',
            graph=tf.get_default_graph())

    while True:
        if callback:
            callback(locals(), globals())

        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break

        logger.log("********** Iteration %i ************" % iters_so_far)

        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        # ------------------ Update G ------------------
        logger.log("Optimizing Policy...")
        for _ in range(g_step):
            with timed("sampling"):
                seg = seg_gen.__next__()
            add_vtarg_and_adv(seg, gamma, lam)
            # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
            ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
                "tdlamret"]
            vpredbefore = seg[
                "vpred"]  # predicted value function before udpate
            atarg = (atarg - atarg.mean()) / atarg.std(
            )  # standardized advantage function estimate

            if hasattr(pi, "ob_rms"):
                pi.ob_rms.update(ob)  # update running mean/std for policy

            args = seg["ob"], seg["ac"], atarg
            fvpargs = [arr[::5] for arr in args]

            assign_old_eq_new(
            )  # set old parameter values to new parameter values
            with timed("computegrad"):
                *lossbefore, g = compute_lossandgrad(*args)
            lossbefore = allmean(np.array(lossbefore))
            g = allmean(g)
            if np.allclose(g, 0):
                logger.log("Got zero gradient. not updating")
            else:
                with timed("cg"):
                    stepdir = cg(fisher_vector_product,
                                 g,
                                 cg_iters=cg_iters,
                                 verbose=rank == 0)
                assert np.isfinite(stepdir).all()
                shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
                lm = np.sqrt(shs / max_kl)
                # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
                fullstep = stepdir / lm
                expectedimprove = g.dot(fullstep)
                surrbefore = lossbefore[0]
                stepsize = 1.0
                thbefore = get_flat()
                for _ in range(10):
                    thnew = thbefore + fullstep * stepsize
                    set_from_flat(thnew)
                    meanlosses = surr, kl, *_ = allmean(
                        np.array(compute_losses(*args)))
                    improve = surr - surrbefore
                    logger.log("Expected: %.3f Actual: %.3f" %
                               (expectedimprove, improve))
                    if not np.isfinite(meanlosses).all():
                        logger.log("Got non-finite value of losses -- bad!")
                    elif kl > max_kl * 1.5:
                        logger.log("violated KL constraint. shrinking step.")
                    elif improve < 0:
                        logger.log("surrogate didn't improve. shrinking step.")
                    else:
                        logger.log("Stepsize OK!")
                        break
                    stepsize *= .5
                else:
                    logger.log("couldn't compute a good step")
                    set_from_flat(thbefore)
                if nworkers > 1 and iters_so_far % 20 == 0:
                    paramsums = MPI.COMM_WORLD.allgather(
                        (thnew.sum(),
                         vfadam.getflat().sum()))  # list of tuples
                    assert all(
                        np.allclose(ps, paramsums[0]) for ps in paramsums[1:])
            with timed("vf"):
                for _ in range(vf_iters):
                    for (mbob, mbret) in dataset.iterbatches(
                        (seg["ob"], seg["tdlamret"]),
                            include_final_partial_batch=False,
                            batch_size=128):

                        if hasattr(pi, "ob_rms"):
                            pi.ob_rms.update(
                                mbob)  # update running mean/std for policy
                        if nworkers != 1:
                            g = allmean(compute_vflossandgrad(mbob, mbret))
                        else:
                            g = compute_vflossandgrad(mbob, mbret)

                        vfadam.update(g, vf_stepsize)

        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))
        # ------------------ Update D ------------------
        logger.log("Optimizing Discriminator...")
        logger.log(fmt_row(13, reward_giver.loss_name))
        ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob))
        batch_size = len(ob) // d_step
        d_losses = [
        ]  # list of tuples, each of which gives the loss for a minibatch
        for ob_batch, ac_batch in dataset.iterbatches(
            (ob, ac), include_final_partial_batch=False,
                batch_size=batch_size):
            ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob_batch))
            # update running mean/std for reward_giver
            if hasattr(reward_giver, "obs_rms"):
                reward_giver.obs_rms.update(
                    np.concatenate((ob_batch, ob_expert), 0))
            *newlosses, g = reward_giver.lossandgrad(ob_batch, ac_batch,
                                                     ob_expert, ac_expert)
            if nworkers != 1:
                d_adam.update(allmean(g), d_stepsize)
            else:
                d_adam.update(g, d_stepsize)

            d_losses.append(newlosses)
        logger.log(fmt_row(13, np.mean(d_losses, axis=0)))
        g_loss_summary = tf.Summary(value=[
            tf.Summary.Value(tag="g_loss",
                             simple_value=np.mean(d_losses[0][0]))
        ])
        summary_writer.add_summary(g_loss_summary, timesteps_so_far)

        d_loss_summary = tf.Summary(value=[
            tf.Summary.Value(tag="d_loss",
                             simple_value=np.mean(d_losses[0][1]))
        ])
        summary_writer.add_summary(d_loss_summary, timesteps_so_far)

        entropy_summary = tf.Summary(value=[
            tf.Summary.Value(tag="entropy",
                             simple_value=np.mean(d_losses[0][2]))
        ])
        summary_writer.add_summary(entropy_summary, timesteps_so_far)

        entropy_loss_summary = tf.Summary(value=[
            tf.Summary.Value(tag="entropy_loss",
                             simple_value=np.mean(d_losses[0][3]))
        ])
        summary_writer.add_summary(entropy_loss_summary, timesteps_so_far)

        g_acc_summary = tf.Summary(value=[
            tf.Summary.Value(tag="g_acc", simple_value=np.mean(d_losses[0][4]))
        ])
        summary_writer.add_summary(g_acc_summary, timesteps_so_far)

        expert_acc_summary = tf.Summary(value=[
            tf.Summary.Value(tag="expert_acc",
                             simple_value=np.mean(d_losses[0][5]))
        ])
        summary_writer.add_summary(expert_acc_summary, timesteps_so_far)

        lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"]
                   )  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews, true_rets = map(flatten_lists, zip(*listoflrpairs))
        true_rewbuffer.extend(true_rets)
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        summary = tf.Summary(value=[
            tf.Summary.Value(tag="MeanDiscriminator",
                             simple_value=np.mean(rewbuffer))
        ])
        summary_writer.add_summary(summary, timesteps_so_far)

        truesummary = tf.Summary(value=[
            tf.Summary.Value(tag="MeanGenerator",
                             simple_value=np.mean(true_rewbuffer))
        ])
        summary_writer.add_summary(truesummary, timesteps_so_far)

        true_rets_summary = tf.Summary(value=[
            tf.Summary.Value(tag="Generator", simple_value=np.mean(true_rets))
        ])
        summary_writer.add_summary(true_rets_summary, timesteps_so_far)

        len_summary = tf.Summary(value=[
            tf.Summary.Value(tag="Length", simple_value=np.mean(lenbuffer))
        ])
        summary_writer.add_summary(len_summary, timesteps_so_far)

        optimgain_summary = tf.Summary(value=[
            tf.Summary.Value(tag="Optimgain",
                             simple_value=np.mean(meanlosses[0]))
        ])
        summary_writer.add_summary(optimgain_summary, timesteps_so_far)

        meankl_summary = tf.Summary(value=[
            tf.Summary.Value(tag="Meankl", simple_value=np.mean(meanlosses[1]))
        ])
        summary_writer.add_summary(meankl_summary, timesteps_so_far)

        entloss_summary = tf.Summary(value=[
            tf.Summary.Value(tag="Entloss",
                             simple_value=np.mean(meanlosses[2]))
        ])
        summary_writer.add_summary(entloss_summary, timesteps_so_far)

        surrgain_summary = tf.Summary(value=[
            tf.Summary.Value(tag="Surrgain",
                             simple_value=np.mean(meanlosses[3]))
        ])
        summary_writer.add_summary(surrgain_summary, timesteps_so_far)

        entropy_summary = tf.Summary(value=[
            tf.Summary.Value(tag="Entropy",
                             simple_value=np.mean(meanlosses[4]))
        ])
        summary_writer.add_summary(entropy_summary, timesteps_so_far)

        epThisIter_summary = tf.Summary(value=[
            tf.Summary.Value(tag="EpThisIter", simple_value=np.mean(len(lens)))
        ])
        summary_writer.add_summary(epThisIter_summary, timesteps_so_far)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("MeanDiscriminator", np.mean(rewbuffer))

        # Save model
        if robot_name == 'scara':
            if iters_so_far % save_per_iter == 0:
                if np.mean(rewbuffer) <= 200 or np.mean(
                        true_rewbuffer) >= -100:
                    task_name = str(iters_so_far)
                    fname = os.path.join(ckpt_dir, task_name)
                    os.makedirs(os.path.dirname(fname), exist_ok=True)
                    saver = tf.train.Saver()
                    saver.save(tf.get_default_session(), fname)
                    if iters_so_far == 2000:
                        break

        elif robot_name == 'mara':
            if iters_so_far % save_per_iter == 0:
                # if np.mean(rewbuffer) <= 300 or np.mean(true_rewbuffer) >= -400:
                task_name = str(iters_so_far)
                fname = os.path.join(ckpt_dir, task_name)
                os.makedirs(os.path.dirname(fname), exist_ok=True)
                saver = tf.train.Saver()
                saver.save(tf.get_default_session(), fname)
                if iters_so_far == 5000:
                    break

        logger.record_tabular("MeanGenerator", np.mean(true_rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        if rank == 0:
            logger.dump_tabular()
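
Each example above rebuilds per-variable tensors from one flat vector (the shapes / U.intprod / tf.reshape loop feeding fvp), which is also what U.GetFlat and U.SetFromFlat do for the parameters themselves. The following is a plain-NumPy sketch of that flatten/unflatten round trip; the array shapes are made up for illustration.

import numpy as np

def get_flat(params):
    """Concatenate a list of arrays into one flat float32 vector (cf. U.GetFlat)."""
    return np.concatenate([p.ravel() for p in params]).astype('float32')

def set_from_flat(params, flat):
    """Slice a flat vector back into the shapes of `params` (cf. U.SetFromFlat)."""
    out, start = [], 0
    for p in params:
        size = int(np.prod(p.shape))              # same role as U.intprod(shape)
        out.append(flat[start:start + size].reshape(p.shape))
        start += size
    assert start == flat.size
    return out

# Round trip on made-up "policy" parameters.
params = [np.random.randn(4, 3), np.random.randn(3), np.random.randn(3, 2)]
theta = get_flat(params)
restored = set_from_flat(params, theta)
assert all(np.allclose(a, b) for a, b in zip(params, restored))
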
Example #28
    def __init__(self, a_name, env, policy_func, par):

        self.env = env
        self.timesteps_per_batch = par.timesteps_per_batch
        self.max_kl = par.max_kl
        self.cg_iters = par.cg_iters
        self.gamma = par.gamma
        self.lam = par.lam  # advantage estimation
        self.entcoeff = par.entcoeff
        self.cg_damping = par.cg_damping
        self.vf_stepsize = par.vf_stepsize
        self.vf_iters = par.vf_iters
        self.max_timesteps = par.max_timesteps
        self.max_episodes = par.max_episodes
        self.max_iters = par.max_iters
        self.callback = par.callback  # you can do anything in the callback, since it takes locals(), globals()

        self.nworkers = MPI.COMM_WORLD.Get_size()
        self.rank = MPI.COMM_WORLD.Get_rank()
        np.set_printoptions(precision=3)
        # Setup losses and stuff
        # ----------------------------------------
        self.ob_space = self.env.observation_space
        self.ac_space = self.env.action_space
        self.pi = policy_func(a_name, self.ob_space, self.ac_space)
        self.oldpi = policy_func("oldpi" + a_name, self.ob_space,
                                 self.ac_space)
        self.atarg = tf.placeholder(
            dtype=tf.float32,
            shape=[None])  # Target advantage function (if applicable)
        self.ret = tf.placeholder(dtype=tf.float32,
                                  shape=[None])  # Empirical return

        self.ob = U.get_placeholder_cached(name="ob" +
                                           str(TRPO_agent_new.index2))
        self.ac = self.pi.pdtype.sample_placeholder([None])

        self.kloldnew = self.oldpi.pd.kl(self.pi.pd)
        self.ent = self.pi.pd.entropy()
        meankl = U.mean(self.kloldnew)
        meanent = U.mean(self.ent)
        entbonus = self.entcoeff * meanent

        self.vferr = U.mean(tf.square(self.pi.vpred - self.ret))

        ratio = tf.exp(self.pi.pd.logp(self.ac) -
                       self.oldpi.pd.logp(self.ac))  # advantage * pnew / pold
        surrgain = U.mean(ratio * self.atarg)

        optimgain = surrgain + entbonus
        self.losses = [optimgain, meankl, entbonus, surrgain, meanent]
        self.loss_names = [
            "optimgain", "meankl", "entloss", "surrgain", "entropy"
        ]

        self.dist = meankl

        all_var_list = self.pi.get_trainable_variables()

        var_list = [
            v for v in all_var_list if v.name.split("/")[1].startswith("pol")
        ]
        vf_var_list = [
            v for v in all_var_list if v.name.split("/")[1].startswith("vf")
        ]
        self.vfadam = MpiAdam(vf_var_list)

        self.get_flat = U.GetFlat(var_list)
        self.set_from_flat = U.SetFromFlat(var_list)
        self.klgrads = tf.gradients(self.dist, var_list)
        self.flat_tangent = tf.placeholder(dtype=tf.float32,
                                           shape=[None],
                                           name="flat_tan" +
                                           str(TRPO_agent_new.index2))

        shapes = [var.get_shape().as_list() for var in var_list]
        start = 0
        self.tangents = []
        for shape in shapes:
            sz = U.intprod(shape)
            self.tangents.append(
                tf.reshape(self.flat_tangent[start:start + sz], shape))
            start += sz

        self.gvp = tf.add_n([
            U.sum(g * tangent)
            for (g, tangent) in zipsame(self.klgrads, self.tangents)
        ])  #pylint: disable=E1111
        self.fvp = U.flatgrad(self.gvp, var_list)

        self.assign_old_eq_new = U.function(
            [], [],
            updates=[
                tf.assign(oldv, newv) for (oldv, newv) in zipsame(
                    self.oldpi.get_variables(), self.pi.get_variables())
            ])

        self.compute_losses = U.function([self.ob, self.ac, self.atarg],
                                         self.losses)
        self.compute_lossandgrad = U.function(
            [self.ob, self.ac, self.atarg],
            self.losses + [U.flatgrad(optimgain, var_list)])
        self.compute_fvp = U.function(
            [self.flat_tangent, self.ob, self.ac, self.atarg], self.fvp)
        self.compute_vflossandgrad = U.function([self.ob, self.ret],
                                                U.flatgrad(
                                                    self.vferr, vf_var_list))

        TRPO_agent_new.index2 += 1
        U.initialize()
        self.th_init = self.get_flat()
        MPI.COMM_WORLD.Bcast(self.th_init, root=0)
        self.set_from_flat(self.th_init)
        self.vfadam.sync()
        print("Init param sum", self.th_init.sum(), flush=True)
Example #29
def learn(
        *,
        network,
        env,
        total_timesteps,
        timesteps_per_batch=1024,  # what to train on
        max_kl=0.001,
        cg_iters=10,
        gamma=0.99,
        lam=1.0,  # advantage estimation
        seed=None,
        ent_coef=0.0,
        cg_damping=1e-2,
        vf_stepsize=3e-4,
        vf_iters=3,
        max_episodes=0,
        max_iters=0,  # time constraint
        callback=None,
        load_path=None,
        **network_kwargs):
    '''
    learn a policy function with TRPO algorithm

    Parameters:
    ----------

    network                 neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types)
                            or function that takes input placeholder and returns tuple (output, None) for feedforward nets
                            or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets

    env                     environment (one of the gym environments or wrapped via a baselines.common.vec_env.VecEnv-type class)

    timesteps_per_batch     timesteps per gradient estimation batch

    max_kl                  max KL divergence between old policy and new policy ( KL(pi_old || pi) )

    ent_coef                coefficient of policy entropy term in the optimization objective

    cg_iters                number of iterations of conjugate gradient algorithm

    cg_damping              conjugate gradient damping

    vf_stepsize             learning rate for the adam optimizer used to optimize the value function loss

    vf_iters                number of value function optimization iterations per policy optimization step

    total_timesteps           max number of timesteps

    max_episodes            max number of episodes

    max_iters               maximum number of policy optimization iterations

    callback                function to be called with (locals(), globals()) each policy optimization step

    load_path               str, path to load the model from (default: None, i.e. no model is loaded)

    **network_kwargs        keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network

    Returns:
    -------

    learnt model

    '''

    if MPI is not None:
        nworkers = MPI.COMM_WORLD.Get_size()
        rank = MPI.COMM_WORLD.Get_rank()
    else:
        nworkers = 1
        rank = 0

    cpus_per_worker = 1
    U.get_session(
        config=tf.ConfigProto(allow_soft_placement=True,
                              inter_op_parallelism_threads=cpus_per_worker,
                              intra_op_parallelism_threads=cpus_per_worker))

    policy = build_policy(env, network, value_network='copy', **network_kwargs)
    set_global_seeds(seed)

    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space

    ob = observation_placeholder(ob_space)
    with tf.variable_scope("pi"):
        pi = policy(observ_placeholder=ob)
    with tf.variable_scope("oldpi"):
        oldpi = policy(observ_placeholder=ob)

    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    entbonus = ent_coef * meanent

    vferr = tf.reduce_mean(tf.square(pi.vf - ret))

    ratio = tf.exp(pi.pd.logp(ac) -
                   oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = tf.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = get_trainable_variables("pi")
    # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
    # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]
    var_list = get_pi_trainable_variables("pi")
    vf_var_list = get_vf_trainable_variables("pi")

    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32,
                                  shape=[None],
                                  name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n([
        tf.reduce_sum(g * tangent)
        for (g, tangent) in zipsame(klgrads, tangents)
    ])  #pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(get_variables("oldpi"), get_variables("pi"))
        ])

    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses +
                                     [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret],
                                       U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        if MPI is not None:
            out = np.empty_like(x)
            MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
            out /= nworkers
        else:
            out = np.copy(x)

        return out

    U.initialize()
    if load_path is not None:
        pi.load(load_path)

    th_init = get_flat()
    if MPI is not None:
        MPI.COMM_WORLD.Bcast(th_init, root=0)

    set_from_flat(th_init)
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards

    if sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) == 0:
        # nothing to be done
        return pi

    assert sum([max_iters>0, total_timesteps>0, max_episodes>0]) < 2, \
        'out of max_iters, total_timesteps, and max_episodes only one should be specified'

    while True:
        if callback: callback(locals(), globals())
        if total_timesteps and timesteps_so_far >= total_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        logger.log("********** Iteration %i ************" % iters_so_far)

        with timed("sampling"):
            seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before udpate
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate

        if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret)
        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        args = seg["ob"], seg["ac"], atarg
        fvpargs = [arr[::5] for arr in args]

        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        assign_old_eq_new()  # set old parameter values to new parameter values
        with timed("computegrad"):
            *lossbefore, g = compute_lossandgrad(*args)
        lossbefore = allmean(np.array(lossbefore))
        g = allmean(g)
        if np.allclose(g, 0):
            logger.log("Got zero gradient. not updating")
        else:
            with timed("cg"):
                stepdir = cg(fisher_vector_product,
                             g,
                             cg_iters=cg_iters,
                             verbose=rank == 0)
            assert np.isfinite(stepdir).all()
            shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
            lm = np.sqrt(shs / max_kl)
            # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
            fullstep = stepdir / lm
            expectedimprove = g.dot(fullstep)
            surrbefore = lossbefore[0]
            stepsize = 1.0
            thbefore = get_flat()
            for _ in range(10):
                thnew = thbefore + fullstep * stepsize
                set_from_flat(thnew)
                meanlosses = surr, kl, *_ = allmean(
                    np.array(compute_losses(*args)))
                improve = surr - surrbefore
                logger.log("Expected: %.3f Actual: %.3f" %
                           (expectedimprove, improve))
                if not np.isfinite(meanlosses).all():
                    logger.log("Got non-finite value of losses -- bad!")
                elif kl > max_kl * 1.5:
                    logger.log("violated KL constraint. shrinking step.")
                elif improve < 0:
                    logger.log("surrogate didn't improve. shrinking step.")
                else:
                    logger.log("Stepsize OK!")
                    break
                stepsize *= .5
            else:
                logger.log("couldn't compute a good step")
                set_from_flat(thbefore)
            if nworkers > 1 and iters_so_far % 20 == 0:
                paramsums = MPI.COMM_WORLD.allgather(
                    (thnew.sum(), vfadam.getflat().sum()))  # list of tuples
                assert all(
                    np.allclose(ps, paramsums[0]) for ps in paramsums[1:])

        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)

        with timed("vf"):

            for _ in range(vf_iters):
                for (mbob, mbret) in dataset.iterbatches(
                    (seg["ob"], seg["tdlamret"]),
                        include_final_partial_batch=False,
                        batch_size=64):
                    g = allmean(compute_vflossandgrad(mbob, mbret))
                    vfadam.update(g, vf_stepsize)

        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        if MPI is not None:
            listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        else:
            listoflrpairs = [lrlocal]

        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        if rank == 0:
            logger.dump_tabular()

    return pi
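
For completeness, a hedged sketch of how the learn() defined above is typically driven; the environment id, the DummyVecEnv wrapper, and every hyperparameter value here are illustrative placeholders rather than settings taken from the example.

# Illustrative driver only: assumes OpenAI baselines and gym are installed and that
# `learn` refers to the function defined in the example above.
import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

venv = DummyVecEnv([lambda: gym.make('CartPole-v1')])  # any gym env, wrapped as a VecEnv
pi = learn(network='mlp',
           env=venv,
           total_timesteps=100000,
           timesteps_per_batch=1024,
           max_kl=0.001,
           cg_iters=10,
           gamma=0.99,
           lam=0.98,
           seed=0)
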
Example #30
def learn(
        *,
        network,
        env,
        seed=None,
        beta,
        total_timesteps,
        sil_update,
        sil_loss,
        timesteps_per_batch=2048,  # what to train on
        epsilon=0.01,
        cg_iters=10,
        gamma=0.99,
        lam=0.98,  # advantage estimation
        entcoeff=0.0,
        lr=3e-4,
        cg_damping=0.1,
        vf_stepsize=1e-3,
        vf_iters=5,
        sil_value=0.01,
        sil_alpha=0.6,
        sil_beta=0.1,
        max_episodes=0,
        max_iters=0,  # time constraint
        callback=None,
        save_interval=0,
        load_path=None,
        model_fn=None,
        update_fn=None,
        init_fn=None,
        mpi_rank_weight=1,
        comm=None,
        vf_coef=0.5,
        max_grad_norm=0.5,
        log_interval=1,
        nminibatches=4,
        noptepochs=4,
        cliprange=0.2,
        TRPO=False,
        **network_kwargs):

    set_global_seeds(seed)
    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()

    policy = build_policy(env,
                          network,
                          value_network='copy',
                          copos=True,
                          **network_kwargs)
    nenvs = env.num_envs
    np.set_printoptions(precision=3)

    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    nbatch = nenvs * timesteps_per_batch
    nbatch_train = nbatch // nminibatches
    is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0)
    if model_fn is None:
        model_fn = Model
    discrete_ac_space = isinstance(ac_space, gym.spaces.Discrete)

    ob = observation_placeholder(ob_space)
    with tf.variable_scope("pi", reuse=tf.AUTO_REUSE):
        pi = policy(observ_placeholder=ob)
        #sil_model=policy(None, None, sess=get_session)
        make_model = lambda: Model(
            policy=policy,
            ob_space=ob_space,
            ac_space=ac_space,
            nbatch_act=nenvs,
            nbatch_train=nbatch_train,
            nsteps=timesteps_per_batch,
            ent_coef=entcoeff,
            vf_coef=vf_coef,
            max_grad_norm=max_grad_norm,
            sil_update=sil_update,
            sil_value=sil_value,
            sil_alpha=sil_alpha,
            sil_beta=sil_beta,
            sil_loss=sil_loss,
            #                                    fn_reward=env.process_reward,
            fn_reward=None,
            #                                    fn_obs=env.process_obs,
            fn_obs=None,
            ppo=False,
            prev_pi='pi',
            silm=pi)
        model = make_model()
        if load_path is not None:
            model.load(load_path)
    with tf.variable_scope("oldpi", reuse=tf.AUTO_REUSE):
        oldpi = policy(observ_placeholder=ob)
        make_old_model = lambda: Model(
            policy=policy,
            ob_space=ob_space,
            ac_space=ac_space,
            nbatch_act=nenvs,
            nbatch_train=nbatch_train,
            nsteps=timesteps_per_batch,
            ent_coef=entcoeff,
            vf_coef=vf_coef,
            max_grad_norm=max_grad_norm,
            sil_update=sil_update,
            sil_value=sil_value,
            sil_alpha=sil_alpha,
            sil_beta=sil_beta,
            sil_loss=sil_loss,
            #                                    fn_reward=env.process_reward,
            fn_reward=None,
            #                                    fn_obs=env.process_obs,
            fn_obs=None,
            ppo=False,
            prev_pi='oldpi',
            silm=oldpi)
        old_model = make_old_model()

    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    old_entropy = oldpi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    entbonus = entcoeff * meanent

    vferr = tf.reduce_mean(tf.square(pi.vf - ret))

    ratio = tf.exp(pi.pd.logp(ac) -
                   oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = tf.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "Entropy"]

    dist = meankl

    #all_var_list = pi.get_trainable_variables()
    #all_var_list = [v for v in all_var_list if v.name.split("/")[0].startswith("pi")]
    #var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
    #vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]

    all_var_list = get_trainable_variables("pi")
    var_list = get_pi_trainable_variables("pi")
    vf_var_list = get_vf_trainable_variables("pi")

    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32,
                                  shape=[None],
                                  name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    # gvp = dot(grad KL, tangent); fvp = grad(gvp), i.e. the Fisher matrix times the tangent
    gvp = tf.add_n([
        tf.reduce_sum(g * tangent)
        for (g, tangent) in zipsame(klgrads, tangents)
    ])  #pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(get_variables("oldpi"), get_variables("pi"))
        ])
    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses +
                                     [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret],
                                       U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    U.initialize()
    if load_path is not None:
        pi.load(load_path)
    th_init = get_flat()
    if MPI is not None:
        MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Initialize eta, omega optimizer
    if discrete_ac_space:
        init_eta = 1
        init_omega = 0.5
        eta_omega_optimizer = EtaOmegaOptimizerDiscrete(
            beta, epsilon, init_eta, init_omega)
    else:
        init_eta = 0.5
        init_omega = 2.0
        # eta/omega optimizer for the continuous-action (non-discrete) COPOS case
        eta_omega_optimizer = EtaOmegaOptimizer(beta, epsilon, init_eta,
                                                init_omega)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(env,
                                     timesteps_per_batch,
                                     model,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards

    assert sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) == 1

    while True:
        if callback: callback(locals(), globals())
        if total_timesteps and timesteps_so_far >= total_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        logger.log("********** Iteration %i ************" % iters_so_far)

        with timed("sampling"):
            seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before udpate
        model = seg["model"]
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate
        #print(ob[:20])
        #print(ac[:20])

        if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret)
        if hasattr(pi, "rms"):
            pi.rms.update(ob)  # update running mean/std for policy

        args = seg["ob"], seg["ac"], atarg
        fvpargs = [arr[::5] for arr in args]

        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        assign_old_eq_new()  # set old parameter values to new parameter values
        with timed("computegrad"):
            *lossbefore, g = compute_lossandgrad(*args)
        lossbefore = allmean(np.array(lossbefore))
        g = allmean(g)
        if np.allclose(g, 0):
            logger.log("Got zero gradient. not updating")
        else:
            with timed("cg"):
                stepdir = cg(fisher_vector_product,
                             g,
                             cg_iters=cg_iters,
                             verbose=rank == 0)
            assert np.isfinite(stepdir).all()

            if TRPO:
                #
                # TRPO specific code.
                # Find correct step size using line search
                #
                shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
                lm = np.sqrt(shs / epsilon)
                # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
                fullstep = stepdir / lm
                expectedimprove = g.dot(fullstep)
                surrbefore = lossbefore[0]
                stepsize = 1.0
                thbefore = get_flat()
                for _ in range(10):
                    thnew = thbefore + fullstep * stepsize
                    set_from_flat(thnew)
                    meanlosses = surr, kl, *_ = allmean(
                        np.array(compute_losses(*args)))
                    improve = surr - surrbefore
                    logger.log("Expected: %.3f Actual: %.3f" %
                               (expectedimprove, improve))
                    if not np.isfinite(meanlosses).all():
                        logger.log("Got non-finite value of losses -- bad!")
                    elif kl > epsilon * 1.5:
                        logger.log("violated KL constraint. shrinking step.")
                    elif improve < 0:
                        logger.log("surrogate didn't improve. shrinking step.")
                    else:
                        logger.log("Stepsize OK!")
                        break
                    stepsize *= .5
                else:
                    logger.log("couldn't compute a good step")
                    set_from_flat(thbefore)
            else:
                #
                # COPOS specific implementation.
                #
                copos_update_dir = stepdir

                # Split direction into log-linear 'w_theta' and non-linear 'w_beta' parts
                w_theta, w_beta = pi.split_w(copos_update_dir)

                tmp_ob = np.zeros(
                    (1, ) + env.observation_space.shape
                )  # We assume that entropy does not depend on the NN

                # Optimize eta and omega
                if discrete_ac_space:
                    entropy = lossbefore[4]
                    #entropy = - 1/timesteps_per_batch * np.sum(np.sum(pi.get_action_prob(ob) * pi.get_log_action_prob(ob), axis=1))
                    eta, omega = eta_omega_optimizer.optimize(
                        pi.compute_F_w(ob, copos_update_dir),
                        pi.get_log_action_prob(ob), timesteps_per_batch,
                        entropy)
                else:
                    Waa, Wsa = pi.w2W(w_theta)
                    wa = pi.get_wa(ob, w_beta)

                    varphis = pi.get_varphis(ob)

                    #old_ent = old_entropy.eval({oldpi.ob: tmp_ob})[0]
                    old_ent = lossbefore[4]
                    eta, omega = eta_omega_optimizer.optimize(
                        w_theta, Waa, Wsa, wa, varphis, pi.get_kt(),
                        pi.get_prec_matrix(), pi.is_new_policy_valid, old_ent)
                logger.log("Initial eta: " + str(eta) + " and omega: " +
                           str(omega))

                current_theta_beta = get_flat()
                prev_theta, prev_beta = pi.all_to_theta_beta(
                    current_theta_beta)

                if discrete_ac_space:
                    # Do a line search for both theta and beta parameters by adjusting only eta
                    eta = eta_search(w_theta, w_beta, eta, omega, allmean,
                                     compute_losses, get_flat, set_from_flat,
                                     pi, epsilon, args, discrete_ac_space)
                    logger.log("Updated eta, eta: " + str(eta))
                    set_from_flat(pi.theta_beta_to_all(prev_theta, prev_beta))
                    # Find proper omega for new eta. Use old policy parameters first.
                    eta, omega = eta_omega_optimizer.optimize(
                        pi.compute_F_w(ob, copos_update_dir),
                        pi.get_log_action_prob(ob), timesteps_per_batch,
                        entropy, eta)
                    logger.log("Updated omega, eta: " + str(eta) +
                               " and omega: " + str(omega))

                    # do line search for ratio for non-linear "beta" parameter values
                    #ratio = beta_ratio_line_search(w_theta, w_beta, eta, omega, allmean, compute_losses, get_flat, set_from_flat, pi,
                    #                     epsilon, beta, args)
                    # set ratio to 1 if we do not use beta ratio line search
                    ratio = 1
                    #print("ratio from line search: " + str(ratio))
                    cur_theta = (eta * prev_theta +
                                 w_theta.reshape(-1, )) / (eta + omega)
                    cur_beta = prev_beta + ratio * w_beta.reshape(-1, ) / eta
                else:
                    for i in range(2):
                        # Do a line search for both theta and beta parameters by adjusting only eta
                        eta = eta_search(w_theta, w_beta, eta, omega, allmean,
                                         compute_losses, get_flat,
                                         set_from_flat, pi, epsilon, args)
                        logger.log("Updated eta, eta: " + str(eta) +
                                   " and omega: " + str(omega))

                        # Find proper omega for new eta. Use old policy parameters first.
                        set_from_flat(
                            pi.theta_beta_to_all(prev_theta, prev_beta))
                        eta, omega = \
                            eta_omega_optimizer.optimize(w_theta, Waa, Wsa, wa, varphis, pi.get_kt(),
                                                         pi.get_prec_matrix(), pi.is_new_policy_valid, old_ent, eta)
                        logger.log("Updated omega, eta: " + str(eta) +
                                   " and omega: " + str(omega))

                    # Use final policy
                    logger.log("Final eta: " + str(eta) + " and omega: " +
                               str(omega))
                    cur_theta = (eta * prev_theta +
                                 w_theta.reshape(-1, )) / (eta + omega)
                    cur_beta = prev_beta + w_beta.reshape(-1, ) / eta

                # Recombine the updated theta and beta blocks into one flat
                # vector and write it back into the policy via SetFromFlat.
                set_from_flat(pi.theta_beta_to_all(cur_theta, cur_beta))

                meanlosses = surr, kl, *_ = allmean(
                    np.array(compute_losses(*args)))
            # COPOS-specific update over
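            # Every 20 iterations, sanity-check that the policy and value
            # function parameters are identical across all MPI workers.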
            if nworkers > 1 and iters_so_far % 20 == 0:
                paramsums = MPI.COMM_WORLD.allgather(
                    (thnew.sum(), vfadam.getflat().sum()))  # list of tuples
                assert all(
                    np.allclose(ps, paramsums[0]) for ps in paramsums[1:])
        # Conjugate-gradient (CG) step over
        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)


        # Policy update over
        with timed("vf"):
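            # Fit the value function: vf_iters passes of Adam updates on
            # minibatches of (observation, TD(lambda) return) pairs, with
            # gradients averaged across MPI workers.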
            for _ in range(vf_iters):
                for (mbob, mbret) in dataset.iterbatches(
                    (seg["ob"], seg["tdlamret"]),
                        include_final_partial_batch=False,
                        batch_size=64):
                    g = allmean(compute_vflossandgrad(mbob, mbret))
                    vfadam.update(g, vf_stepsize)
        with timed('SIL'):
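            # Self-imitation learning (SIL) update; lrnow is the learning rate
            # evaluated on the remaining fraction of training (lr is presumably
            # a schedule callable defined earlier).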
            lrnow = lr(1.0 - timesteps_so_far / total_timesteps)
            l_loss, sil_adv, sil_samples, sil_nlogp = model.sil_train(lrnow)

        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        print("Reward max: " + str(max(rewbuffer)))
        print("Reward min: " + str(min(rewbuffer)))

        logger.record_tabular(
            "EpLenMean",
            np.mean(lenbuffer) if np.sum(lenbuffer) != 0.0 else 0.0)
        logger.record_tabular(
            "EpRewMean",
            np.mean(rewbuffer) if np.sum(rewbuffer) != 0.0 else 0.0)
        logger.record_tabular(
            "AverageReturn",
            np.mean(rewbuffer) if np.sum(rewbuffer) != 0.0 else 0.0)
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if sil_update > 0:
            logger.record_tabular("SilSamples", sil_samples)

        if rank == 0:
            logger.dump_tabular()