Example #1
def test(config, env):
    ob_space = env.observation_space
    ac_space = env.action_space
    tf.reset_default_graph()
    gpu_opts = tf.GPUOptions(allow_growth=True)
    tf_config = tf.ConfigProto(
        inter_op_parallelism_threads=1,
        intra_op_parallelism_threads=1,
        gpu_options=gpu_opts,
    )
    with tf.Session(config=tf_config) as sess:
        nenvs = env.num_envs
        nbatch = nenvs * config.number_of_steps
        nbatch_train = nbatch // 4
        policy = build_policy(env, 'cnn')
        model = Model(
            policy=policy,
            ob_space=ob_space,
            ac_space=ac_space,
            nbatch_act=nenvs,
            nbatch_train=nbatch_train,
            nsteps=config.number_of_steps,
            ent_coef=config.entropy_weight,
            vf_coef=config.critic_weight,
            max_grad_norm=config.max_grad_norm,
            comm=None,
            mpi_rank_weight=1
        )
        model.load(config.load_path)
        return make_rollouts(config, env, model)
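The test helper above only reads a few attributes from config. Below is a minimal sketch of a compatible config object, assuming nothing beyond the attribute accesses visible in the snippet; the values and the checkpoint path are purely illustrative.

import types

# Hypothetical config; field names mirror the attributes test() reads above.
config = types.SimpleNamespace(
    number_of_steps=128,            # rollout length per environment
    entropy_weight=0.01,            # forwarded to Model as ent_coef
    critic_weight=0.5,              # forwarded to Model as vf_coef
    max_grad_norm=0.5,              # gradient clipping threshold
    load_path='checkpoints/00100',  # illustrative checkpoint path
)
# rollouts = test(config, env)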
Example #2
    def __init__(self, env, env_type, stochastic):
        """
        The constructor uses the environment to construct the policy-building function and then builds the agent model.

        Parameters
        ----------
        env : gym.env
            The env the agent needs to interact with.
        env_type : str
            The type of env.
        stochastic : bool
            Whether the agent behaves stochastically, i.e. actions are sampled rather than chosen greedily.
        """
        ob_space = env.observation_space
        ac_space = env.action_space
        self.stochastic = stochastic

        # Select the policy builder that matches the environment type
        if env_type == 'atari':
            policy = build_policy(env, 'cnn')
        elif env_type == "ChessWrapper":
            policy = build_policy(env, 'mlp', **{'num_layers': 5})
        else:
            policy = build_policy(env, 'mlp')

        # Construct the agent model using the selected policy builder
        make_model = lambda: Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=1, nbatch_train=1,
                                   nsteps=1, ent_coef=0., vf_coef=0.,
                                   max_grad_norm=0.)
        self.model = make_model()
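A usage sketch for the constructor above. The enclosing class name is not shown in the snippet, so PPO2Agent is assumed here; the commented step() call relies on baselines' ppo2 Model exposing its act model's step method.

import gym

# Hypothetical usage; the class name PPO2Agent is an assumption (it is not shown above).
env = gym.make('PongNoFrameskip-v4')
agent = PPO2Agent(env, env_type='atari', stochastic=True)

obs = env.reset()
# baselines' ppo2 Model exposes step() via its act model:
# actions, values, states, neglogpacs = agent.model.step(obs)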
Example #3
def demonstrate(network,
                env,
                nsteps,
                mvs,
                load_path,
                ent_coef=0.0,
                vf_coef=0.5,
                max_grad_norm=0.5,
                mpi_rank_weight=1,
                comm=None,
                gamma=0.99,
                lam=0.95):

    policy = build_policy(env, network)

    model = Model(policy=policy,
                  nbatch_act=1,
                  nbatch_train=None,
                  nsteps=nsteps,
                  ent_coef=ent_coef,
                  vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm,
                  comm=comm,
                  mpi_rank_weight=mpi_rank_weight)

    if load_path is not None:
        model.load(load_path)
        print('Model has been successfully loaded from {0}'.format(load_path))
    else:
        print('No model has been loaded; a randomly initialized network will be used.')

    # Instantiate the runner object and episode buffer

    runner = Runner(env=env,
                    model=model,
                    nsteps=nsteps,
                    gamma=gamma,
                    lam=lam,
                    mvs=mvs)
    obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(
        render=True)

    print('Demo completed! Reward: {0}'.format(epinfos[0]['r']))
    print('\nPress Ctrl+C to stop the demo...')
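A hedged call sketch for demonstrate(). The mvs argument is simply forwarded to this project's Runner and its meaning is not visible here; the checkpoint path is illustrative.

# Illustrative only: env construction and the mvs value depend on the surrounding project.
demonstrate(network='cnn',
            env=env,
            nsteps=128,
            mvs=None,                             # assumption: placeholder for the custom Runner argument
            load_path='checkpoints/ppo2_latest')  # illustrative path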
Example #4
    def __init__(self, env, env_type, path, stochastic=False, gpu=True):
        from baselines.common.policies import build_policy
        from baselines.ppo2.model import Model

        self.graph = tf.Graph()

        if gpu:
            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
        else:
            config = tf.ConfigProto(device_count={'GPU': 0})

        self.sess = tf.Session(graph=self.graph, config=config)

        with self.graph.as_default():
            with self.sess.as_default():
                if isinstance(env.observation_space, gym.spaces.Dict):
                    ob_space = env.observation_space.spaces['ob_flattened']
                else:
                    ob_space = env.observation_space
                ac_space = env.action_space

                if env_type == 'atari':
                    policy = build_policy(env, 'cnn')
                elif env_type in ['mujoco', 'robosuite']:
                    policy = build_policy(env, 'mlp')
                else:
                    assert False, 'unsupported env_type: {}'.format(env_type)

                make_model = lambda: Model(policy=policy,
                                           ob_space=ob_space,
                                           ac_space=ac_space,
                                           nbatch_act=1,
                                           nbatch_train=1,
                                           nsteps=1,
                                           ent_coef=0.,
                                           vf_coef=0.,
                                           max_grad_norm=0.)
                self.model = make_model()

                self.model_path = path
                self.model.load(path)

        if env_type in ['mujoco', 'robosuite']:
            with open(path + '.env_stat.pkl', 'rb') as f:
                import pickle
                s = pickle.load(f)
            self.ob_rms = s['ob_rms']
            #self.ret_rms = s['ret_rms']
            self.clipob = 10.
            self.epsilon = 1e-8
        else:
            self.ob_rms = None

        self.stochastic = stochastic
Example #5
    def __init__(self, env, env_type, stochastic=False):
        ob_space = env.observation_space
        ac_space = env.action_space

        if env_type == 'atari':
            policy = build_policy(env, 'cnn')
        elif env_type == 'mujoco':
            policy = build_policy(env, 'mlp')

        make_model = lambda: Model(policy=policy,
                                   ob_space=ob_space,
                                   ac_space=ac_space,
                                   nbatch_act=1,
                                   nbatch_train=1,
                                   nsteps=1,
                                   ent_coef=0.,
                                   vf_coef=0.,
                                   max_grad_norm=0.)
        self.model = make_model()
        self.stochastic = stochastic
Example #6
    def __init__(self, env, env_type, path, stochastic=False, gpu=True):
        from baselines.common.policies import build_policy
        from baselines.ppo2.model import Model

        self.graph = tf.Graph()

        if gpu:
            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
        else:
            config = tf.ConfigProto(device_count={'GPU': 0})

        self.sess = tf.Session(graph=self.graph, config=config)

        with self.graph.as_default():
            with self.sess.as_default():
                ob_space = env.observation_space
                ac_space = env.action_space

                if env_type == 'atari':
                    policy = build_policy(env, 'cnn')
                else:
                    assert False, 'unsupported env_type: {}'.format(env_type)

                make_model = lambda: Model(policy=policy,
                                           ob_space=ob_space,
                                           ac_space=ac_space,
                                           nbatch_act=1,
                                           nbatch_train=1,
                                           nsteps=1,
                                           ent_coef=0.,
                                           vf_coef=0.,
                                           max_grad_norm=0.)
                self.model = make_model()

                self.model_path = path
                self.model.load(path)

        self.stochastic = stochastic
Example #7
def main7():
    retro.data.add_custom_integration("custom")

    def wrap_deepmind_n64(env, reward_scale=1 / 100.0, frame_stack=1, grayscale=False):
        env = MaxAndSkipEnv(env, skip=4)
        env = WarpFrame(env, width=150, height=100, grayscale=grayscale)
        env = FrameStack(env, frame_stack)
        env = ScaledFloatFrame(env)
        env = RewardScaler(env, scale=reward_scale)
        return env

    def make_env():
        retro.data.add_custom_integration("custom")
        env = retro.n64_env.N64Env(game="SuperSmashBros-N64",
                                   use_restricted_actions=retro.Actions.MULTI_DISCRETE,
                                   inttype=retro.data.Integrations.CUSTOM,
                                   obs_type=retro.Observations.IMAGE)
        env = wrap_deepmind_n64(env)
        return env

    gpu_options = tf.GPUOptions(allow_growth=True)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    nenvs = 2
    # env = DummyVecEnv([make_env] * nenvs)
    env = SubprocVecEnv([make_env] * nenvs)
    network_name = "impala_cnn_lstm"
    policy = build_policy(env, network_name)
    recurrent = "lstm" in network_name
    ob_space = env.observation_space
    ac_space = env.action_space
    nsteps = 10
    nminibatches = 2
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    model = Model(policy=policy,
                  ob_space=ob_space,
                  ac_space=ac_space,
                  nbatch_act=nenvs,
                  nbatch_train=nbatch_train,
                  nsteps=nsteps,
                  ent_coef=0.01,
                  vf_coef=0.5,
                  max_grad_norm=0.5,
                  comm=None,
                  mpi_rank_weight=1)
    runner = Runner(env=env, model=model, nsteps=10, gamma=.99, lam=.95)

    env.reset()
    start = time.time()  # start the timer used for the elapsed-time printout below
    num_steps = 20000
    action = [np.array([0, 0, 0]), np.array([0, 0, 0])]
    for i in range(num_steps):
        sys.stdout.write(f"\r{i+1} / {num_steps}")
        action = [env.action_space.sample() for _ in range(nenvs)]
        obs, reward, dones, info = env.step(action)
        # env.reset(dones)
        # env.render()

        if i % 50 == 0:
            if recurrent:
                fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(20, 12))
            else:
                fig, axs = plt.subplots(nrows=4, ncols=2, figsize=(20, 12))
            for env_index in range(nenvs):
                if recurrent:
                    axs[env_index].imshow(obs[env_index, :, :, :])
                else:
                    for j in range(4):
                        row = env_index * 2 + j // 2
                        col = j % 2
                        print(row)
                        print(col)
                        axs[row, col].imshow(obs[env_index, :, :, j])
            plt.show()
            plt.close()
    end = time.time()
    print(end - start)

    return env
Example #8
    def __init__(self, env, env_type, nenv=4, batch_size=64, gpu=True):
        from baselines.common.policies import build_policy
        from baselines.ppo2.model import Model

        self.graph = tf.Graph()

        if gpu:
            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
        else:
            config = tf.ConfigProto(device_count={'GPU': 0})

        self.sess = tf.Session(graph=self.graph, config=config)

        with self.graph.as_default():
            with self.sess.as_default():
                ob_space = env.observation_space
                ac_space = env.action_space

                if env_type == 'atari':
                    policy = build_policy(env, 'cnn')
                    target_action = tf.placeholder(tf.int32, [batch_size])
                elif env_type == 'mujoco':
                    policy = build_policy(env, 'mlp')
                    target_action = tf.placeholder(
                        tf.float32, [batch_size, ac_space.shape[0]])
                else:
                    assert False, 'unsupported env_type: {}'.format(env_type)

                make_model = lambda: Model(policy=policy,
                                           ob_space=ob_space,
                                           ac_space=ac_space,
                                           nbatch_act=nenv,
                                           nbatch_train=batch_size,
                                           nsteps=1,
                                           ent_coef=0.,
                                           vf_coef=0.,
                                           max_grad_norm=0.)
                self.model = make_model()

                self.inp = self.model.train_model.X  # This is also a placeholder
                self.target_action = target_action

                self.ac_logits = self.model.train_model.pi

                if env_type == 'atari':
                    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                        logits=self.ac_logits, labels=self.target_action)
                elif env_type == 'mujoco':
                    loss = tf.reduce_sum((self.ac_logits - self.target_action)**2, axis=1)

                self.loss = tf.reduce_mean(loss, axis=0)

                policy_optim = tf.train.AdamOptimizer(1e-4)
                policy_params = tf.trainable_variables('ppo2_model/pi')
                self.update_op = policy_optim.minimize(self.loss,
                                                       var_list=policy_params)

                # Value Fn Optimization
                self.R = R = tf.placeholder(tf.float32, [None])
                self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None])
                self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])

                vpred = self.model.train_model.vf
                vpredclipped = OLDVPRED + tf.clip_by_value(
                    vpred - OLDVPRED, -CLIPRANGE, CLIPRANGE)
                # Unclipped value
                vf_losses1 = tf.square(vpred - R)
                # Clipped value
                vf_losses2 = tf.square(vpredclipped - R)

                self.vf_loss = .5 * tf.reduce_mean(
                    tf.maximum(vf_losses1, vf_losses2))

                value_optim = tf.train.AdamOptimizer(1e-4)
                value_params = tf.trainable_variables('ppo2_model/vf')
                self.value_update_op = value_optim.minimize(
                    self.vf_loss, var_list=value_params)

                ################ Miscellaneous
                self.init_op = tf.group(tf.global_variables_initializer(),
                                        tf.local_variables_initializer())

            self.sess.run(self.init_op)
Example #9
def main():
    num_envs = 64
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    total_timesteps = 1_000_000  ## now this counts steps in testing runs
    use_vf_clipping = True

    ## From random_ppo.py
    max_grad_norm = 0.5
    vf_coef = 0.5
    L2_WEIGHT = 10e-4
    FM_COEFF = 0.002
    REAL_THRES = 0.1

    parser = argparse.ArgumentParser(
        description='Process procgen testing arguments.')
    parser.add_argument('--env_name', type=str, default='fruitbot')
    parser.add_argument(
        '--distribution_mode',
        type=str,
        default='easy',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=1000)
    ## default start_level is set past the training levels to test on unseen levels!
    parser.add_argument('--start_level', type=int, default=1000)
    parser.add_argument('--run_id', '-id', type=int, default=0)
    parser.add_argument('--load_id', type=int, default=0)
    parser.add_argument('--nrollouts', '-nroll', type=int, default=0)

    args = parser.parse_args()
    args.total_timesteps = total_timesteps
    if args.nrollouts:
        total_timesteps = int(args.nrollouts * num_envs * nsteps)
    run_ID = 'run_' + str(args.run_id).zfill(2)
    run_ID += '_load{}'.format(args.load_id)

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    mpi_rank_weight = 0
    num_levels = args.num_levels

    log_comm = comm.Split(0, 0)
    format_strs = ['csv', 'stdout', 'log'] if log_comm.Get_rank() == 0 else []

    logpath = join(LOG_DIR, run_ID)
    if not os.path.exists(logpath):
        os.system("mkdir -p %s" % logpath)

    fpath = join(logpath, 'args_{}.json'.format(run_ID))
    with open(fpath, 'w') as fh:
        json.dump(vars(args), fh, indent=4, sort_keys=True)
    print("\nSaved args at:\n\t{}\n".format(fpath))

    logger.configure(dir=logpath, format_strs=format_strs)

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=args.env_name,
                      num_levels=num_levels,
                      start_level=args.start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")

    venv = VecMonitor(
        venv=venv,
        filename=None,
        keep_buf=100,
    )
    venv = VecNormalize(venv=venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    sess = tf.compat.v1.Session(config=config)
    sess.__enter__()

    logger.info("Testing")
    ## Modified based on random_ppo.learn
    env = venv
    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    nrollouts = total_timesteps // nbatch

    network = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)
    policy = build_policy(env, network)
    model = Model(policy=policy,
                  ob_space=ob_space,
                  ac_space=ac_space,
                  nbatch_act=nenvs,
                  nbatch_train=nbatch_train,
                  nsteps=nsteps,
                  ent_coef=ent_coef,
                  vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm)

    LOAD_PATH = "log/vanilla/saved_vanilla_v{}.tar".format(args.load_id)
    model.load(LOAD_PATH)
    logger.info("Model pramas loaded from save")
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

    epinfobuf10 = deque(maxlen=10)
    epinfobuf100 = deque(maxlen=100)
    # tfirststart = time.time() ## Not doing timing yet
    # active_ep_buf = epinfobuf100

    mean_rewards = []
    datapoints = []
    for rollout in range(1, nrollouts + 1):
        logger.info('collecting rollouts {}...'.format(rollout))
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(
        )  ## different from random_ppo!
        epinfobuf10.extend(epinfos)
        epinfobuf100.extend(epinfos)

        rew_mean_10 = safemean([epinfo['r'] for epinfo in epinfobuf10])
        rew_mean_100 = safemean([epinfo['r'] for epinfo in epinfobuf100])
        ep_len_mean_10 = np.nanmean([epinfo['l'] for epinfo in epinfobuf10])
        ep_len_mean_100 = np.nanmean([epinfo['l'] for epinfo in epinfobuf100])

        logger.info('\n----', rollout)
        mean_rewards.append(rew_mean_10)
        logger.logkv('eprew10', rew_mean_10)
        logger.logkv('eprew100', rew_mean_100)
        logger.logkv('eplenmean10', ep_len_mean_10)
        logger.logkv('eplenmean100', ep_len_mean_100)
        logger.logkv("misc/total_timesteps", rollout * nbatch)

        logger.info('----\n')
        logger.dumpkvs()
    env.close()

    print("Rewards history: ", mean_rewards)
    return mean_rewards
Example #10
from baselines.common.cmd_util import make_vec_env

frame_stack_size = 4
env = make_vec_env('AssaultNoFrameskip-v0', 'atari', 1, 0)
env = VecFrameStack(env, frame_stack_size)

ob_space = env.observation_space
ac_space = env.action_space
network_type = 'cnn'

policy_network_fn = get_network_builder(network_type)()
network = policy_network_fn(ob_space.shape)

model = Model(ac_space=ac_space,
              policy_network=network,
              ent_coef=0.0,
              vf_coef=0.5,
              max_grad_norm=0.5)

ckpt = tf.train.Checkpoint(model=model)
manager = tf.train.CheckpointManager(ckpt, '../models/PPO22', max_to_keep=None)
ckpt.restore(manager.latest_checkpoint)

obs = env.reset()

state = model.initial_state

episode_reward = 0
while True:
    if state is not None:
        actions, _, state, _ = model.step(obs)
Example #11
        stacked=True,
        include_rendering=args.render)
    env.observation_space = Box(low=0, high=255, shape=(72, 96, 16))
    policy = build_policy(env=env, policy_network='cnn')

    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space

    # Instantiate the model object (that creates act_model and train_model)
    model = Model(policy=policy,
                  ob_space=env.observation_space,
                  ac_space=Discrete(19),
                  nbatch_act=1,
                  nbatch_train=16,
                  nsteps=128,
                  ent_coef=0.01,
                  vf_coef=0.05,
                  max_grad_norm=0.5,
                  comm=None,
                  mpi_rank_weight=1)

    model.load(args.checkpoint)
    # model = PPO2.load(args.checkpoint)
    # player = get_player(args=args)
    for _ in range(args.how_many):
        obs = env.reset()
        cnt = 1
        dones = [False]
        states = model.initial_state
        done = False
Example #12
def eval(args):
    logdir = str(Path(args.logbase_path) / args.env_id)

    env = gym.make(args.env_id)

    valid_agents = []
    models = sorted(Path(args.learners_path).glob('?????'))
    for path in models:
        if path.name > args.max_chkpt:
            continue
        agent = PPO2Agent(env,
                          args.env_type,
                          str(path),
                          stochastic=args.stochastic)
        valid_agents.append(agent)

    test_agents = []
    for i, path in enumerate(models):
        if i % 10 == 0:
            agent = PPO2Agent(env,
                              args.env_type,
                              str(path),
                              stochastic=args.stochastic)
            test_agents.append(agent)

    gt_dataset = GTDataset(env)
    gt_dataset.prebuilt(valid_agents, -1)

    gt_dataset_test = GTDataset(env)
    gt_dataset_test.prebuilt(test_agents, -1)

    models = []
    for i in range(args.num_models):
        with tf.variable_scope('model_%d' % i):
            models.append(
                Model(args.include_action,
                      env.observation_space.shape[0],
                      env.action_space.shape[0],
                      steps=args.steps))

    ### Initialize Parameters
    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())
    # Training configuration
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.InteractiveSession(config=config)

    sess.run(init_op)

    for i, model in enumerate(models):
        model.saver.restore(sess, logdir + '/model_%d.ckpt' % (i))

        print('model %d' % i)
        obs, acs, r = gt_dataset.trajs
        r_hat = model.get_reward(obs, acs)

        obs, acs, r_test = gt_dataset_test.trajs
        r_hat_test = model.get_reward(obs, acs)

        fig, axes = plt.subplots(1, 2)
        axes[0].plot(r, r_hat, 'o')
        axes[1].plot(r_test, r_hat_test, 'o')
        fig.savefig('model_%d.png' % i)
        imgcat(fig)
        plt.close(fig)

        np.savez('model_%d.npz' % i,
                 r=r,
                 r_hat=r_hat,
                 r_test=r_test,
                 r_hat_test=r_hat_test)
Example #13
def train(args):
    logdir = Path(args.log_dir)

    if logdir.exists():
        c = input(
            'log dir already exists. Continue to train a preference model? [Y/etc]? '
        )
        if c in ['YES', 'yes', 'Y']:
            import shutil
            shutil.rmtree(str(logdir))
        else:
            print('good bye')
            return

    logdir.mkdir(parents=True)
    with open(str(logdir / 'args.txt'), 'w') as f:
        f.write(str(args))

    logdir = str(logdir)
    env = gym.make(args.env_id)

    train_agents = [RandomAgent(env.action_space)] if args.random_agent else []

    models = sorted([
        p for p in Path(args.learners_path).glob('?????')
        if int(p.name) <= args.max_chkpt
    ])
    for path in models:
        agent = PPO2Agent(env,
                          args.env_type,
                          str(path),
                          stochastic=args.stochastic)
        train_agents.append(agent)

    if args.preference_type == 'gt':
        dataset = GTDataset(env)
    elif args.preference_type == 'gt_traj':
        dataset = GTTrajLevelDataset(env)
    elif args.preference_type == 'gt_traj_no_steps':
        dataset = GTTrajLevelNoStepsDataset(env, args.max_steps)
    elif args.preference_type == 'gt_traj_no_steps_noise':
        dataset = GTTrajLevelNoSteps_Noise_Dataset(env, args.max_steps,
                                                   args.traj_noise)
    elif args.preference_type == 'gt_traj_no_steps_n_mix':
        dataset = GTTrajLevelNoSteps_N_Mix_Dataset(env, args.N, args.max_steps)
    elif args.preference_type == 'time':
        dataset = LearnerDataset(env, args.min_margin)
    else:
        assert False, 'specify preference type'

    dataset.prebuilt(train_agents, args.min_length)

    models = []
    for i in range(args.num_models):
        with tf.variable_scope('model_%d' % i):
            models.append(
                Model(args.include_action,
                      env.observation_space.shape[0],
                      env.action_space.shape[0],
                      steps=args.steps,
                      num_layers=args.num_layers,
                      embedding_dims=args.embedding_dims))

    ### Initialize Parameters
    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())
    # Training configuration
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.InteractiveSession(config=config)

    sess.run(init_op)

    for i, model in enumerate(models):
        D = dataset.sample(args.D,
                           args.steps,
                           include_action=args.include_action)

        if D is None:
            model.train_with_dataset(dataset,
                                     64,
                                     include_action=args.include_action,
                                     debug=True)
        else:
            model.train(D,
                        l2_reg=args.l2_reg,
                        noise_level=args.noise,
                        debug=True)

        model.saver.save(sess,
                         logdir + '/model_%d.ckpt' % (i),
                         write_meta_graph=False)
Example #14
def learn(env,
          nenvs,
          network,
          password,
          total_timesteps=1e6,
          seed=None,
          nsteps=2048,
          ent_coef=0.0,
          lr=3e-4,
          vf_coef=0.5,
          max_grad_norm=0.5,
          gamma=0.99,
          lam=0.95,
          log_interval=10,
          nminibatches=4,
          noptepochs=4,
          cliprange=0.2,
          save_interval=0,
          save_path=None,
          load_path=None,
          **network_kwargs):

    set_global_seeds(seed)
    save_dir = save_path

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    policy = build_policy(env, network, value_network='copy', **network_kwargs)

    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space

    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    model = Model(policy=policy,
                  ob_space=ob_space,
                  ac_space=ac_space,
                  nbatch_act=nenvs,
                  nbatch_train=nbatch_train,
                  nsteps=nsteps,
                  ent_coef=ent_coef,
                  vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm)

    if load_path is not None:
        model.load(load_path)

    # Instantiate the runner object
    runner = ProcessRunner(env=env,
                           model=model,
                           n_env=nenvs,
                           n_steps=nsteps,
                           gamma=gamma,
                           lam=lam,
                           password=password,
                           verbose=0,
                           **network_kwargs)

    epinfobuf = deque(maxlen=100)

    # Start total timer
    tfirststart = time.time()

    nupdates = total_timesteps // nbatch
    if save_interval is None:
        save_interval = nupdates // 5

    for update in range(1, nupdates + 1):
        logger.log("# " + "=" * 78)
        logger.log("# Iteration %i / %i" % (update, nupdates))
        logger.log("# " + "=" * 78)
        assert nbatch % nminibatches == 0
        # Start timer
        tstart = time.time()
        frac = 1.0 - (update - 1.0) / nupdates
        # Calculate the learning rate
        lrnow = lr(frac)
        # Calculate the cliprange
        cliprangenow = cliprange(frac)
        # Get minibatch
        policy_param = get_session().run(
            tf.trainable_variables('ppo2_model/pi'))
        valfn_param = get_session().run(
            tf.trainable_variables('ppo2_model/vf'))
        obs, rewards, returns, masks, actions, values, neglogpacs, action_mean, states, epinfos, dataset_total_rew = runner.run(
            policy_param, valfn_param)  #pylint: disable=E0632
        ## !! TEST !!
        # with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
        # da_, v_, nglp_, mean_, std_, logstd_ = policy().step_debug(obs, actions)
        # if not ((np.isclose(da_, action_mean, atol=5e-7)).all()):
        # print(da_ - action_mean)
        # print("action no match")
        # if not ((np.isclose(v_, values, atol=5e-7)).all()):
        # print(v_ - values)
        # print("value no match")
        # if not ((np.isclose(nglp_, neglogpacs, atol=5e-7)).all()):
        # print(nglp_-neglogpacs)
        # print("neglogp no match")
        # __import__('ipdb').set_trace()
        # print("Debugging!")
        ## !! TEST !!

        epinfobuf.extend(epinfos)

        # Here what we're going to do is for each minibatch calculate the loss and append it.
        mblossvals = []
        if states is None:  # nonrecurrent version
            # Index of each element of batch_size
            # Create the indices array
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                # Randomize the indexes
                np.random.shuffle(inds)
                # 0 to batch_size with batch_train_size step
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow,
                                                  *slices))
        else:  # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            envsperbatch = nbatch_train // nsteps
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(
                        model.train(lrnow, cliprangenow, *slices, mbstates))

        # Feedforward --> get losses --> update
        lossvals = np.mean(mblossvals, axis=0)
        # End timer
        tnow = time.time()
        # Calculate the fps (frame per second)
        fps = int(nbatch / (tnow - tstart))
        if update % log_interval == 0 or update == 1:
            # Check whether the value function is a good predictor of the returns (ev close to 1)
            # or worse than predicting nothing (ev <= 0)
            ev = explained_variance(values, returns)
            logger.logkv("serial_timesteps", update * nsteps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))
            logger.logkv('eprewmean',
                         safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('dataset_rew', dataset_total_rew / nenvs)
            logger.logkv('eplenmean',
                         safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.logkv('serial_num_dones', int(masks.sum() / nenvs))
            logger.logkv('total_num_dones', masks.sum())
            logger.logkv('time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)
            if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
                logger.dumpkvs()
        if (update == nupdates and (MPI is None or MPI.COMM_WORLD.Get_rank() == 0)) \
                or \
                (save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and (MPI is None or MPI.COMM_WORLD.Get_rank() == 0)):
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            save_path = osp.join(checkdir, '%.5i' % update)
            print('Saving TF model to', save_path)
            model.save(save_path)
            save_dataset(save_path, nsteps, obs, rewards, returns, masks,
                         actions, values)
            save_model_to_yaml(save_path, **network_kwargs)

    ## !! TEST !! ##
    # with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
    # __import__('ipdb').set_trace()
    # while(True):
    # da_, v_, nglp_, mean_, std_, logstd_ = policy().step_debug(obs, actions)
    ## !! TEST !! ##

    return model
    load_path = 'ppo2_lstm_slow.h5'
    # load_path = 'ppo2_base_delayed2.h5'
    # load_path = 'models15/ppo_model_1.h5'
    # model_i = 3
    model_i = ''
    # load_path = 'models/%s.h5' % model_i

    max_ticks = int(60*3*(1/0.016))
    env = HaxballProcPoolVecEnv(num_fields=nenvs, max_ticks=max_ticks)
    policy = build_policy(env=env, policy_network='lstm', nlstm=nlstm)
    # policy = build_policy(env=env, policy_network='lstm', nlstm=512)  # num_layers=4, num_hidden=256)

    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    model = PPOModel(policy=policy, ob_space=env.observation_space, ac_space=env.action_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=0.05, vf_coef=0.5, max_grad_norm=0.5)  # 0.005) #, vf_coef=0.0)
    if load_path is not None and os.path.exists(load_path):
        model.load(load_path)
    # model = StaticModel()
    # model = RandomModel(action_space=env.action_space)
    # model = PazzoModel(action_space=env.action_space)
    # model = StaticModel(default_action=7, action_space=env.action_space)
    # model = StaticModel(action_space=env.action_space)
    # nbatch = 100 * 12
    # nbatch_train = nbatch // 4
    # model = PPOModel(policy=policy, nsteps=12, ent_coef=0.05, ob_space=env.observation_space, ac_space=env.action_space, nbatch_act=100, nbatch_train=nbatch_train, vf_coef=0.5, max_grad_norm=0.5)# 0.005) #, vf_coef=0.0)


    size = width, height = 900, 520
    center = (width // 2, height // 2 + 30)
    black = 105, 150, 90