def train(env_id, num_timesteps, seed, num_cpu, batch, lr):
    from rl.common import set_global_seeds
    from rl.common.vec_env.vec_normalize import MAVecNormalize
    from rl.common.ma_wrappers import MAWrapper
    from sandbox.mppo import ppo2
    from sandbox.mppo.policies import MlpPolicy
    import gym
    import tensorflow as tf
    from rl.common.vec_env.subproc_vec_env import SubprocVecEnv
    from rl import bench, logger
    ncpu = 1
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()

    def _make_env():
        env = gym.make(env_id)
        env = MAWrapper(env)
        env = bench.Monitor(env, logger.get_dir())
        return env

    env = SubprocVecEnv([_make_env for _ in range(num_cpu)], is_multi_agent=True)
    env = MAVecNormalize(env)

    set_global_seeds(seed)
    policy = MlpPolicy
    ppo2.learn(policy=policy, env=env, nsteps=batch // num_cpu, nminibatches=32,
        lam=0.95, gamma=0.99, noptepochs=10, log_interval=1,
        ent_coef=0.0,
        lr=lr,
        cliprange=0.2,
        total_timesteps=num_timesteps)
Example #2
def train(logdir, env_id, num_timesteps, lr, timesteps_per_batch, seed,
          num_cpu):
    def create_env(rank):
        def _thunk():
            env = make_env.make_env(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(env,
                                logger.get_dir()
                                and os.path.join(logger.get_dir(), str(rank)),
                                allow_early_resets=True)
            gym.logger.setLevel(logging.WARN)
            return env

        return _thunk

    logger.configure(logdir,
                     format_strs=['stdout', 'log', 'json', 'tensorboard'])

    set_global_seeds(seed)
    env = SubprocVecEnv([create_env(i) for i in range(num_cpu)],
                        is_multi_agent=True)
    policy_fn = CategoricalPolicy
    learn(policy_fn,
          env,
          seed,
          total_timesteps=int(num_timesteps * 1.1),
          nprocs=num_cpu,
          nsteps=timesteps_per_batch // num_cpu,
          lr=lr,
          ent_coef=0.00,
          identical=make_env.get_identical(env_id))
    env.close()
Example #3
def train(env_id, num_timesteps, seed, num_cpu, batch, lr):
    from rl.common import set_global_seeds
    from rl.common.vec_env.vec_normalize import MAVecNormalize
    from rl.common.ma_wrappers import MAWrapper
    from sandbox.mppo import ppo2
    from sandbox.mppo.policies import MlpPolicy
    import gym
    import tensorflow as tf
    from rl.common.vec_env.subproc_vec_env import SubprocVecEnv
    import os
    from rl import bench, logger
    ncpu = 1
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()

    def _make_env(rank):
        env = gym.make('RoboSumo-Ant-vs-Ant-v0')
        env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
        return env

    # Bind i as a default argument so each subprocess env gets its own rank.
    env = SubprocVecEnv([lambda rank=i: _make_env(rank) for i in range(num_cpu)],
                        is_multi_agent=True)
    env = MAVecNormalize(env)

    set_global_seeds(seed)
    policy = MlpPolicy
    expert = MADataSet('/atlas/u/tsong/Projects/imitation/ant-vs-ant.pkl')
    ppo2.learn(policy=policy, env=env, nsteps=batch // num_cpu, nminibatches=160,
        lam=0.95, gamma=0.99, noptepochs=10, log_interval=1,
        ent_coef=0.0,
        lr=lr,
        cliprange=0.2,
        total_timesteps=num_timesteps, expert=expert, clone_iters=1000)
Example #4
def train(logdir,
          env_id,
          num_timesteps,
          lr,
          timesteps_per_batch,
          seed,
          num_cpu,
          expert_path,
          traj_limitation,
          ret_threshold,
          dis_lr,
          disc_type='decentralized',
          bc_iters=500,
          l2=0.1,
          d_iters=1,
          rew_scale=0.1):
    def create_env(rank):
        def _thunk():
            env = make_env.make_env(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(env,
                                logger.get_dir()
                                and os.path.join(logger.get_dir(), str(rank)),
                                allow_early_resets=True)
            gym.logger.setLevel(logging.WARN)
            return env

        return _thunk

    logger.configure(logdir,
                     format_strs=['stdout', 'log', 'json', 'tensorboard'])

    set_global_seeds(seed)
    env = SubprocVecEnv([create_env(i) for i in range(num_cpu)],
                        is_multi_agent=True)
    print(num_cpu)
    policy_fn = CategoricalPolicy
    expert = MADataSet(expert_path,
                       ret_threshold=ret_threshold,
                       traj_limitation=traj_limitation,
                       nobs_flag=True)
    learn(policy_fn,
          expert,
          env,
          env_id,
          seed,
          total_timesteps=int(num_timesteps * 1.1),
          nprocs=num_cpu,
          nsteps=timesteps_per_batch // num_cpu,
          lr=lr,
          ent_coef=0.0,
          dis_lr=dis_lr,
          disc_type=disc_type,
          bc_iters=bc_iters,
          identical=make_env.get_identical(env_id),
          l2=l2,
          d_iters=d_iters,
          rew_scale=rew_scale)
    env.close()
Example #5
def make_gym_env(env_id, num_env=2, seed=123, wrapper_kwargs=None, start_index=0):
    """
    Create a wrapped, SubprocVecEnv for Gym Environments.
    """
    if wrapper_kwargs is None: wrapper_kwargs = {}
    def make_env(rank): # pylint: disable=C0111
        def _thunk():
            env = gym.make(env_id)
            env.seed(seed + rank)
            return env
        return _thunk
    set_global_seeds(seed)
    return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)])
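# Usage sketch for make_gym_env above (illustrative only: the environment id and
# num_env value are arbitrary placeholders, not taken from the surrounding examples).
#
#   venv = make_gym_env('CartPole-v1', num_env=4, seed=0)
#   obs = venv.reset()                       # batched observations, one row per worker
#   acts = [venv.action_space.sample() for _ in range(venv.num_envs)]
#   obs, rews, dones, infos = venv.step(acts)
#   venv.close()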
Example #6
def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.95, lam=0.92, log_interval=1, nprocs=32, nsteps=20,
          nstack=1, ent_coef=0.00, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
          kfac_clip=0.001, save_interval=100, lrschedule='linear', identical=None):
    tf.reset_default_graph()
    set_global_seeds(seed)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    make_model = lambda: Model(policy, ob_space, ac_space, nenvs, total_timesteps,
                               nprocs=nprocs, nsteps=nsteps, nstack=nstack,
                               ent_coef=ent_coef, vf_coef=vf_coef,
                               vf_fisher_coef=vf_fisher_coef, lr=lr,
                               max_grad_norm=max_grad_norm, kfac_clip=kfac_clip,
                               lrschedule=lrschedule, identical=identical)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()

    runner = Runner(env, model, nsteps=nsteps, nstack=nstack, gamma=gamma, lam=lam)
    nbatch = nenvs * nsteps
    tstart = time.time()
    coord = tf.train.Coordinator()
    # enqueue_threads = [q_runner.create_threads(model.sess, coord=coord, start=True) for q_runner in model.q_runner]
    for update in range(1, total_timesteps // nbatch + 1):
        obs, states, rewards, masks, actions, values = runner.run()
        policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
        model.old_obs = obs
        nseconds = time.time() - tstart
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            ev = [explained_variance(values[k], rewards[k]) for k in range(model.num_agents)]
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)

            for k in range(model.num_agents):
                # logger.record_tabular('reward %d' % k, np.mean(rewards[k]))
                logger.record_tabular("explained_variance %d" % k, float(ev[k]))
                logger.record_tabular("policy_entropy %d" % k, float(policy_entropy[k]))
                logger.record_tabular("policy_loss %d" % k, float(policy_loss[k]))
                logger.record_tabular("value_loss %d" % k, float(value_loss[k]))
            logger.dump_tabular()

        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
            savepath = osp.join(logger.get_dir(), 'checkpoint%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)
    coord.request_stop()
    # coord.join(enqueue_threads)
    env.close()
Example #7
def train(logdir,
          env_id,
          lr,
          num_timesteps,
          seed,
          timesteps_per_batch,
          cont=False):
    from sandbox.ppo_sgd import mlp_policy
    from sandbox.ppo_sgd import pposgd_simple
    from rl import logger
    from rl.common import set_global_seeds, tf_util as U
    from rl import bench

    from gym.envs.registration import register
    import multiagent
    import make_env
    import gym
    import logging
    import os.path as osp

    logger.configure(logdir, format_strs=['log', 'json', 'tensorboard'])
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = make_env.make_env(env_id)

    def policy_fn(name, ob_space, ac_space, id):
        pi = mlp_policy.MlpPolicy(name=name,
                                  ob_space=ob_space,
                                  ac_space=ac_space,
                                  hid_size=64,
                                  num_hid_layers=2,
                                  id=id)
        return pi

    env = bench.Monitor(
        env,
        logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    pposgd_simple.learn(env,
                        policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_batch=timesteps_per_batch,
                        clip_param=0.2,
                        entcoeff=0.0,
                        optim_epochs=10,
                        optim_stepsize=lr,
                        optim_batchsize=64,
                        gamma=0.99,
                        lam=0.95,
                        schedule='linear',
                        cont=cont)
    env.close()
    return None
Example #8
def train(logdir, env_id, num_timesteps, lr, timesteps_per_batch, seed,
          num_cpu, max_episode_len):
    def create_env(rank):
        def _thunk():
            env = make_env.make_env(env_id, max_episode_len=max_episode_len)
            env.discrete_action_input = True
            env.seed(seed + rank)
            env = bench.Monitor(env,
                                logger.get_dir()
                                and os.path.join(logger.get_dir(), str(rank)),
                                allow_early_resets=True)
            gym.logger.setLevel(logging.WARN)
            return env

        return _thunk

    logger.configure(logdir, format_strs=['json'])

    set_global_seeds(seed)
    env = SubprocVecEnv([create_env(i) for i in range(num_cpu)],
                        is_multi_agent=True)
    policy_fn = CategoricalPolicy
    learn(policy_fn,
          env,
          seed,
          total_timesteps=int(num_timesteps * 1.1),
          nprocs=num_cpu,
          nsteps=timesteps_per_batch // num_cpu,
          lr=lr,
          ent_coef=0.00,
          identical=make_env.get_identical(env_id),
          log_interval=50,
          save_interval=int(num_timesteps / timesteps_per_batch),
          max_episode_len=max_episode_len)
    logger.Logger.CURRENT.close()
    env.close()
Example #9
def learn(
        *,
        network,
        env,
        eval_policy,
        total_timesteps,
        timesteps_per_batch=1024,  # what to train on
        max_kl=0.001,
        cg_iters=10,
        gamma=0.99,
        lam=1.0,  # advantage estimation
        seed=None,
        ent_coef=0.0,
        cg_damping=1e-2,
        vf_stepsize=3e-4,
        vf_iters=3,
        max_episodes=0,
        max_iters=0,  # time constraint
        callback=None,
        load_path=None,
        checkpoint_path_in=None,
        checkpoint_dir_out=None,
        checkpoint_freq=100,  # in iterations
        from_iter=0,
        eval_episodes=20,
        **network_kwargs):
    '''
    learn a policy function with TRPO algorithm

    Parameters:
    ----------

    network                 neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types)
                            or function that takes input placeholder and returns tuple (output, None) for feedforward nets
                            or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets

    env                     environment (one of the gym environments, or an environment wrapped via a baselines.common.vec_env.VecEnv-type class)

    timesteps_per_batch     timesteps per gradient estimation batch

    max_kl                  max KL divergence between old policy and new policy ( KL(pi_old || pi) )

    ent_coef                coefficient of policy entropy term in the optimization objective

    cg_iters                number of iterations of conjugate gradient algorithm

    cg_damping              conjugate gradient damping

    vf_stepsize             learning rate for the Adam optimizer used to optimize the value function loss

    vf_iters                number of value function optimization iterations per policy optimization step

    total_timesteps           max number of timesteps

    max_episodes            max number of episodes

    max_iters               maximum number of policy optimization iterations

    callback                function to be called with (locals(), globals()) each policy optimization step

    load_path               str, path to load the model from (default: None, i.e. no model is loaded)

    **network_kwargs        keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network

    Returns:
    -------

    learnt model (a commented usage sketch follows this function)

    '''

    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()

    cpus_per_worker = 1
    U.get_session(
        config=tf.ConfigProto(allow_soft_placement=True,
                              inter_op_parallelism_threads=cpus_per_worker,
                              intra_op_parallelism_threads=cpus_per_worker))

    policy = build_policy(env, network, value_network='copy', **network_kwargs)

    set_global_seeds(seed)

    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    # ob_space = Box(low=-np.inf, high=np.inf, shape=(env.observation_space.n,))
    ob_space = env.observation_space
    ac_space = env.action_space

    ob = observation_placeholder(ob_space)
    with tf.variable_scope("pi"):
        pi = policy(observ_placeholder=ob)
    with tf.variable_scope("oldpi"):
        oldpi = policy(observ_placeholder=ob)
    # Loading checkpoint
    if checkpoint_path_in is not None and os.path.isfile(checkpoint_path_in):
        pi.load(checkpoint_path_in)
        logger.log('Loaded policy weights from %s' % checkpoint_path_in)

    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    entbonus = ent_coef * meanent

    vferr = tf.reduce_mean(tf.square(pi.vf - ret))

    ratio = tf.exp(pi.pd.logp(ac) -
                   oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = tf.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = get_trainable_variables("pi")
    # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
    # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]
    var_list = get_pi_trainable_variables("pi")
    vf_var_list = get_vf_trainable_variables("pi")

    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32,
                                  shape=[None],
                                  name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n([
        tf.reduce_sum(g * tangent)
        for (g, tangent) in zipsame(klgrads, tangents)
    ])  # pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(get_variables("oldpi"), get_variables("pi"))
        ])

    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses +
                                     [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret],
                                       U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    U.initialize()
    # s = env.reset()
    # start = time.time()
    # for i in range(10000):
    #     pi.step(s, stochastic=True)
    # duration = time.time() - start
    # print(duration)
    # return
    if load_path is not None:
        pi.load(load_path)

    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True,
                                     gamma=gamma)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    iters_eval = 0
    all_logs = []
    best_rew = -np.inf

    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards

    online_scores = []
    offline_scores = []
    if sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) == 0:
        # nothing to be done
        return pi

    assert sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) < 2, \
        'out of max_iters, total_timesteps, and max_episodes only one should be specified'

    while True:
        if callback: callback(locals(), globals())
        if total_timesteps and timesteps_so_far >= total_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        logger.log("********** Iteration %i ************" % iters_so_far)

        if iters_so_far % checkpoint_freq == 0 and checkpoint_dir_out is not None:
            if not os.path.exists(checkpoint_dir_out):
                os.makedirs(checkpoint_dir_out)
            pi.save(
                os.path.join(checkpoint_dir_out,
                             'checkpoint_%d' % iters_so_far))
            logger.log('Saved policy weights as %s' % os.path.join(
                checkpoint_dir_out, 'checkpoint_%d.npy' % iters_so_far))

            def pi_wrapper(ob):
                ac, vpred, _, _ = pi.step(ob, stochastic=True)
                return ac

            rew, _, logs, disc_rets, num_stops, avg_damages = eval_policy(
                pi=pi_wrapper, n_episodes=eval_episodes, verbose=True)
            offline_scores.append(
                [np.mean(disc_rets),
                 np.mean(num_stops),
                 np.mean(avg_damages)])
            np.save(os.path.join(checkpoint_dir_out, 'offline_scores.npy'),
                    offline_scores)
            for log in logs:
                log['iter'] = iters_eval
            all_logs = all_logs + logs

            iters_eval += 1

        with timed("sampling"):
            seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update

        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate

        if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret)
        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        args = seg["ob"], seg["ac"], atarg
        fvpargs = [arr[::5] for arr in args]

        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        assign_old_eq_new()  # set old parameter values to new parameter values
        with timed("computegrad"):
            *lossbefore, g = compute_lossandgrad(*args)
        lossbefore = allmean(np.array(lossbefore))
        g = allmean(g)
        if np.allclose(g, 0):
            logger.log("Got zero gradient. not updating")
        else:
            with timed("cg"):
                stepdir = cg(fisher_vector_product,
                             g,
                             cg_iters=cg_iters,
                             verbose=rank == 0)
            assert np.isfinite(stepdir).all()
            shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
            lm = np.sqrt(shs / max_kl)
            # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
            fullstep = stepdir / lm
            expectedimprove = g.dot(fullstep)
            surrbefore = lossbefore[0]
            stepsize = 1.0
            thbefore = get_flat()
            for _ in range(10):
                thnew = thbefore + fullstep * stepsize
                set_from_flat(thnew)
                meanlosses = surr, kl, *_ = allmean(
                    np.array(compute_losses(*args)))
                improve = surr - surrbefore
                logger.log("Expected: %.3f Actual: %.3f" %
                           (expectedimprove, improve))
                if not np.isfinite(meanlosses).all():
                    logger.log("Got non-finite value of losses -- bad!")
                elif kl > max_kl * 1.5:
                    logger.log("violated KL constraint. shrinking step.")
                elif improve < 0:
                    logger.log("surrogate didn't improve. shrinking step.")
                else:
                    logger.log("Stepsize OK!")
                    break
                stepsize *= .5
            else:
                logger.log("couldn't compute a good step")
                set_from_flat(thbefore)
            if nworkers > 1 and iters_so_far % 20 == 0:
                paramsums = MPI.COMM_WORLD.allgather(
                    (thnew.sum(), vfadam.getflat().sum()))  # list of tuples
                assert all(
                    np.allclose(ps, paramsums[0]) for ps in paramsums[1:])

        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)

        with timed("vf"):

            for _ in range(vf_iters):
                for (mbob, mbret) in dataset.iterbatches(
                    (seg["ob"], seg["tdlamret"]),
                        include_final_partial_batch=False,
                        batch_size=64):
                    g = allmean(compute_vflossandgrad(mbob, mbret))
                    vfadam.update(g, vf_stepsize)

        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        ep_rew_mean = np.mean(rewbuffer)
        online_scores.append(ep_rew_mean)
        np.save(os.path.join(checkpoint_dir_out, 'online_scores.npy'),
                online_scores)
        # Saving best
        if iters_so_far % checkpoint_freq == 0 and ep_rew_mean > best_rew and checkpoint_dir_out is not None:
            pi.save(os.path.join(checkpoint_dir_out, 'best'))
            best_rew = ep_rew_mean
            logger.log('Saved policy weights as %s' %
                       os.path.join(checkpoint_dir_out, 'best.npy'))

        if rank == 0:
            logger.dump_tabular()

    return pi
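# Usage sketch for the learn() above (illustrative only: `my_vec_env` and
# `my_eval_policy` are hypothetical placeholders and the hyperparameter values
# are arbitrary). `eval_policy` is expected to be a callable taking
# (pi, n_episodes, verbose) and returning the six values unpacked in the
# evaluation block above.
#
#   pi = learn(network='mlp', env=my_vec_env, eval_policy=my_eval_policy,
#              total_timesteps=int(1e6), timesteps_per_batch=1024,
#              max_kl=0.001, cg_iters=10, gamma=0.99, lam=0.98,
#              checkpoint_dir_out='./checkpoints', checkpoint_freq=100,
#              eval_episodes=20)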
Example #10
def learn(policy,
          expert,
          env,
          env_id,
          seed,
          total_timesteps=int(40e6),
          gamma=0.99,
          lam=0.95,
          log_interval=1,
          nprocs=32,
          nsteps=20,
          nstack=1,
          ent_coef=0.01,
          vf_coef=0.5,
          vf_fisher_coef=1.0,
          lr=0.25,
          max_grad_norm=0.5,
          kfac_clip=0.001,
          save_interval=100,
          lrschedule='linear',
          dis_lr=0.001,
          disc_type='decentralized',
          bc_iters=500,
          identical=None,
          l2=0.1,
          d_iters=1,
          rew_scale=0.1):
    tf.reset_default_graph()
    set_global_seeds(seed)
    buffer = None

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    num_agents = len(ob_space)
    make_model = lambda: Model(policy,
                               ob_space,
                               ac_space,
                               nenvs,
                               total_timesteps,
                               nprocs=nprocs,
                               nsteps=nsteps,
                               nstack=nstack,
                               ent_coef=ent_coef,
                               vf_coef=vf_coef,
                               vf_fisher_coef=vf_fisher_coef,
                               lr=lr,
                               max_grad_norm=max_grad_norm,
                               kfac_clip=kfac_clip,
                               lrschedule=lrschedule,
                               identical=identical)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()
    if disc_type == 'decentralized' or disc_type == 'decentralized-all':
        discriminator = [
            Discriminator(
                model.sess,
                ob_space,
                ac_space,
                state_only=True,
                discount=gamma,
                nstack=nstack,
                index=k,
                disc_type=disc_type,
                scope="Discriminator_%d" % k,  # gp_coef=gp_coef,
                total_steps=total_timesteps // (nprocs * nsteps),
                lr_rate=dis_lr,
                l2_loss_ratio=l2) for k in range(num_agents)
        ]
    else:
        assert False

    # add reward regularization
    if env_id == 'simple_tag':
        reward_reg_loss = tf.reduce_mean(
            tf.square(discriminator[0].reward + discriminator[3].reward) +
            tf.square(discriminator[1].reward + discriminator[3].reward) +
            tf.square(discriminator[2].reward +
                      discriminator[3].reward)) + rew_scale * tf.reduce_mean(
                          tf.maximum(0.0, 1 - discriminator[0].reward) +
                          tf.maximum(0.0, 1 - discriminator[1].reward) +
                          tf.maximum(0.0, 1 - discriminator[2].reward) +
                          tf.maximum(0.0, discriminator[3].reward + 1))
        reward_reg_lr = tf.placeholder(tf.float32, ())
        reward_reg_optim = tf.train.AdamOptimizer(learning_rate=reward_reg_lr)
        reward_reg_train_op = reward_reg_optim.minimize(reward_reg_loss)

    tf.global_variables_initializer().run(session=model.sess)
    runner = Runner(env,
                    model,
                    discriminator,
                    nsteps=nsteps,
                    nstack=nstack,
                    gamma=gamma,
                    lam=lam,
                    disc_type=disc_type,
                    nobs_flag=True)
    nbatch = nenvs * nsteps
    tstart = time.time()
    coord = tf.train.Coordinator()
    # enqueue_threads = [q_runner.create_threads(model.sess, coord=coord, start=True) for q_runner in model.q_runner]
    for _ in range(bc_iters):
        e_obs, e_actions, e_nobs, _, _ = expert.get_next_batch(nenvs * nsteps)
        e_a = [np.argmax(e_actions[k], axis=1) for k in range(len(e_actions))]
        lld_loss = model.clone(e_obs, e_a)
        # print(lld_loss)

    update_policy_until = 10

    for update in range(1, total_timesteps // nbatch + 1):
        obs, obs_next, states, rewards, report_rewards, masks, actions, values, all_obs, all_nobs,\
        mh_actions, mh_all_actions, mh_rewards, mh_true_rewards, mh_true_returns = runner.run()

        total_loss = np.zeros((num_agents, d_iters))

        idx = 0
        idxs = np.arange(len(all_obs))
        random.shuffle(idxs)
        all_obs = all_obs[idxs]
        mh_actions = [mh_actions[k][idxs] for k in range(num_agents)]
        mh_obs = [obs[k][idxs] for k in range(num_agents)]
        mh_obs_next = [obs_next[k][idxs] for k in range(num_agents)]
        mh_values = [values[k][idxs] for k in range(num_agents)]

        if buffer:
            buffer.update(mh_obs, mh_actions, mh_obs_next, all_obs, mh_values)
        else:
            buffer = Dset(mh_obs,
                          mh_actions,
                          mh_obs_next,
                          all_obs,
                          mh_values,
                          randomize=True,
                          num_agents=num_agents,
                          nobs_flag=True)

        d_minibatch = nenvs * nsteps

        for d_iter in range(d_iters):
            e_obs, e_actions, e_nobs, e_all_obs, _ = expert.get_next_batch(
                d_minibatch)
            g_obs, g_actions, g_nobs, g_all_obs, _ = buffer.get_next_batch(
                batch_size=d_minibatch)

            e_a = [
                np.argmax(e_actions[k], axis=1) for k in range(len(e_actions))
            ]
            g_a = [
                np.argmax(g_actions[k], axis=1) for k in range(len(g_actions))
            ]

            g_log_prob = model.get_log_action_prob(g_obs, g_a)
            e_log_prob = model.get_log_action_prob(e_obs, e_a)
            if disc_type == 'decentralized':
                for k in range(num_agents):
                    total_loss[k, d_iter] = discriminator[k].train(
                        g_obs[k], g_actions[k], g_nobs[k],
                        g_log_prob[k].reshape([-1, 1]), e_obs[k], e_actions[k],
                        e_nobs[k], e_log_prob[k].reshape([-1, 1]))
            elif disc_type == 'decentralized-all':
                g_obs_all = np.concatenate(g_obs, axis=1)
                g_actions_all = np.concatenate(g_actions, axis=1)
                g_nobs_all = np.concatenate(g_nobs, axis=1)
                e_obs_all = np.concatenate(e_obs, axis=1)
                e_actions_all = np.concatenate(e_actions, axis=1)
                e_nobs_all = np.concatenate(e_nobs, axis=1)
                for k in range(num_agents):
                    total_loss[k, d_iter] = discriminator[k].train(
                        g_obs_all, g_actions_all, g_nobs_all,
                        g_log_prob[k].reshape([-1,
                                               1]), e_obs_all, e_actions_all,
                        e_nobs_all, e_log_prob[k].reshape([-1, 1]))
            else:
                assert False

            if env_id == 'simple_tag':
                if disc_type == 'decentralized':
                    feed_dict = {
                        discriminator[k].obs:
                        np.concatenate([g_obs[k], e_obs[k]], axis=0)
                        for k in range(num_agents)
                    }
                elif disc_type == 'decentralized-all':
                    feed_dict = {
                        discriminator[k].obs:
                        np.concatenate([g_obs_all, e_obs_all], axis=0)
                        for k in range(num_agents)
                    }
                else:
                    assert False
                feed_dict[reward_reg_lr] = discriminator[0].lr.value()
                model.sess.run(reward_reg_train_op, feed_dict=feed_dict)

            idx += 1

        if update > update_policy_until:  # 10
            policy_loss, value_loss, policy_entropy = model.train(
                obs, states, rewards, masks, actions, values)
        model.old_obs = obs
        nseconds = time.time() - tstart
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            ev = [
                explained_variance(values[k], rewards[k])
                for k in range(model.num_agents)
            ]
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)

            for k in range(model.num_agents):
                logger.record_tabular("explained_variance %d" % k,
                                      float(ev[k]))
                if update > update_policy_until:
                    logger.record_tabular("policy_entropy %d" % k,
                                          float(policy_entropy[k]))
                    logger.record_tabular("policy_loss %d" % k,
                                          float(policy_loss[k]))
                    logger.record_tabular("value_loss %d" % k,
                                          float(value_loss[k]))
                    try:
                        logger.record_tabular(
                            'pearson %d' % k,
                            float(
                                pearsonr(report_rewards[k].flatten(),
                                         mh_true_returns[k].flatten())[0]))
                        logger.record_tabular(
                            'spearman %d' % k,
                            float(
                                spearmanr(report_rewards[k].flatten(),
                                          mh_true_returns[k].flatten())[0]))
                        logger.record_tabular('reward %d' % k,
                                              float(np.mean(rewards[k])))
                    except Exception:
                        pass

            total_loss_m = np.mean(total_loss, axis=1)
            for k in range(num_agents):
                logger.record_tabular("total_loss %d" % k, total_loss_m[k])
            logger.dump_tabular()

        if save_interval and (update % save_interval == 0
                              or update == 1) and logger.get_dir():
            savepath = osp.join(logger.get_dir(), 'm_%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)
            if disc_type == 'decentralized' or disc_type == 'decentralized-all':
                for k in range(num_agents):
                    savepath = osp.join(logger.get_dir(),
                                        'd_%d_%.5i' % (k, update))
                    discriminator[k].save(savepath)
            else:
                assert False
    coord.request_stop()
    # coord.join(enqueue_threads)
    env.close()
Example #11
def make_env(rank):
    def env_fn():
        env = gym.make('{}NoFrameskip-v4'.format(env_id))
        env.seed(seed + rank)
        return env
        return wrap_deepmind(env)  # unreachable: the early return above skips the DeepMind wrappers
    return env_fn

if 1:
    from rl_algs.common.vec_env.mpi_vec_env1 import MpiVecEnv
    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    nenvs = comm.Get_size()
    env = make_env(comm.Get_rank())()
    env = MpiVecEnv(env, comm)
    A = np.array([env.action_space.sample() for _ in range(env.num_envs)])*0
elif 1:
    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(nenvs)])
    A = np.array([env.action_space.sample() for _ in range(env.num_envs)])*0
else:
    env = make_env(0)()
    A = env.action_space.sample()*0
    env.num_envs = 1

env.reset()

nsteps = 1000
tstart = time.time()
blah = 0
for _ in range(nsteps):
    ob,rew,done,_ = env.step(A)
    for q in (ob, rew, done):
        blah += np.asarray(q).sum()  # consume the returned data (illustrative completion of the truncated loop body)
Example #12
import numpy as np

from rl.a2c.a2c import Model
from rl.a2c.runner import Runner
from rl.common import set_global_seeds

from baselines.stochastic import Model as Stochastic
from baselines.rule import Model as Rule

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt


if __name__ == '__main__':
    set_global_seeds(args.seed)

    logger = get_logger('trading')
    logger.info(str(args))

    env = Env('train')

    # Instantiate the model objects (that creates defender_model and adversary_model)
    model = Model(
        ob_size=env.ob_size,
        act_size=env.act_size,
        learning_rate=args.lr,
        latents=args.latents,
        activation=args.activation,
        optimizer=args.optimizer,
        vf_coef=args.vf_coef,