Example #1
# Imports assumed for this snippet: gym plus OpenAI baselines' Monitor and logger.
import gym
from baselines import logger
from baselines.bench import Monitor


def build_env(args):
    env = gym.make(args.env_id)
    env.seed(args.seed)  # seed the env so results are reproducible
    env = Monitor(env, logger.get_dir(), allow_early_resets=True)

    return env
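
A minimal usage sketch for build_env; the argparse flags below are hypothetical, and only the env_id and seed attribute names are taken from the function itself.

# Hypothetical driver for build_env (not part of the original example).
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--env_id', type=str, default='CartPole-v1')
parser.add_argument('--seed', type=int, default=0)
args = parser.parse_args()

env = build_env(args)
obs = env.reset()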
Example #2
    # Note: this closure is defined inside a helper that supplies env_name;
    # gym, os, and baselines' logger / Monitor are assumed imported at module level.
    def make_env(subrank=None):
        env = gym.make(env_name)
        if subrank is not None and logger.get_dir() is not None:
            try:
                from mpi4py import MPI
                mpi_rank = MPI.COMM_WORLD.Get_rank()
            except ImportError:
                MPI = None
                mpi_rank = 0
                logger.warn('Running with a single MPI process. This should work, but the results may differ from the ones published in Plappert et al.')

            max_episode_steps = env._max_episode_steps
            env = Monitor(env,
                          os.path.join(logger.get_dir(), str(mpi_rank) + '.' + str(subrank)),
                          allow_early_resets=True)
            # hack to re-expose _max_episode_steps (ideally should replace reliance on it downstream)
            env = gym.wrappers.TimeLimit(env, max_episode_steps=max_episode_steps)
        return env
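
One way to consume make_env is to build several per-subrank copies into a vectorized env; the sketch below assumes baselines' DummyVecEnv is available and that make_env's enclosing scope (env_name, logger) is already set up.

# Usage sketch (assumptions: baselines' DummyVecEnv; num_envs is hypothetical).
from functools import partial
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

num_envs = 4
vec_env = DummyVecEnv([partial(make_env, subrank=i) for i in range(num_envs)])
obs = vec_env.reset()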
Example #3
# Imports assumed for this snippet; CnnPolicy, learn, play, and movie are
# taken to be this project's own modules.
import os
import random

import numpy as np
import torch as tc
from mpi4py import MPI
from baselines import logger
from baselines.bench import Monitor
from baselines.common.atari_wrappers import make_atari, wrap_deepmind


def main(args):
    # mpi communicator.
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    # seed.
    workerseed = args.seed + 10000 * comm.Get_rank() if args.seed is not None else None
    if workerseed is not None:
        tc.manual_seed(workerseed % 2 ** 32)
        np.random.seed(workerseed % 2 ** 32)
        random.seed(workerseed % 2 ** 32)

    # logger.
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])

    # env.
    env = make_atari(args.env_name)
    env.seed(workerseed)
    env = Monitor(env, logger.get_dir() and
                  os.path.join(logger.get_dir(), str(rank)))
    print(f"frame_stacking: {args.frame_stacking}")
    env = wrap_deepmind(env, frame_stack=args.frame_stacking,
                        clip_rewards=(args.mode == 'train'),
                        episode_life=(args.mode == 'train'))  # See Mnih et al., 2015 -> Methods -> Training Details.
    env.seed(workerseed)

    # agent.
    agent = CnnPolicy(
        img_channels=env.observation_space.shape[-1],
        num_actions=env.action_space.n,
        kind=args.model_type)

    # optimizer and scheduler.
    max_grad_steps = args.optim_epochs * args.env_steps // (comm.Get_size() * args.optim_batchsize)

    optimizer = tc.optim.Adam(agent.parameters(), lr=args.optim_stepsize, eps=1e-5)
    scheduler = tc.optim.lr_scheduler.OneCycleLR(
        optimizer=optimizer, max_lr=args.optim_stepsize, total_steps=max_grad_steps,
        pct_start=0.0, anneal_strategy='linear', cycle_momentum=False,
        div_factor=1.0)

    # checkpoint.
    if rank == 0:
        try:
            state_dict = tc.load(os.path.join(args.checkpoint_dir, args.model_name, 'model.pth'))
            agent.load_state_dict(state_dict)
            print(f"Continuing from checkpoint found at {os.path.join(args.checkpoint_dir, args.model_name, 'model.pth')}")
        except FileNotFoundError:
            print("Bad checkpoint or none on process 0. Continuing from scratch.")

    # sync: broadcast rank-0's parameters so every MPI worker starts from identical weights.
    with tc.no_grad():
        for p in agent.parameters():
            p_data = p.data.numpy()
            comm.Bcast(p_data, root=0)
            p.data.copy_(tc.tensor(p_data).float())

    # operations.
    if args.mode == 'train':
        learn(env=env, agent=agent, optimizer=optimizer, scheduler=scheduler, comm=comm,
              timesteps_per_actorbatch=args.timesteps_per_actorbatch, max_timesteps=args.env_steps,
              optim_epochs=args.optim_epochs, optim_batchsize=args.optim_batchsize,
              gamma=args.gamma, lam=args.lam, clip_param=args.epsilon, entcoeff=args.ent_coef,
              checkpoint_dir=args.checkpoint_dir, model_name=args.model_name)
        env.close()

    elif args.mode == 'play':
        if comm.Get_rank() == 0:
            play(env=env, agent=agent, args=args)
            env.close()

    elif args.mode == 'movie':
        if comm.Get_rank() == 0:
            movie(env=env, agent=agent, args=args)
            env.close()

    else:
        raise NotImplementedError("Mode of operation not supported!")
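
A hedged entry-point sketch for this example; every flag below is inferred from the args.* attributes main reads, and all types and defaults are placeholders.

# Hypothetical CLI wiring for main(); flag names mirror the attributes used above.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_name', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--mode', choices=['train', 'play', 'movie'], default='train')
    parser.add_argument('--frame_stacking', action='store_true')
    parser.add_argument('--model_type', type=str, default='large')
    parser.add_argument('--model_name', type=str, default='model')
    parser.add_argument('--checkpoint_dir', type=str, default='checkpoints')
    parser.add_argument('--env_steps', type=int, default=10_000_000)
    parser.add_argument('--timesteps_per_actorbatch', type=int, default=256)
    parser.add_argument('--optim_epochs', type=int, default=3)
    parser.add_argument('--optim_batchsize', type=int, default=64)
    parser.add_argument('--optim_stepsize', type=float, default=2.5e-4)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--lam', type=float, default=0.95)
    parser.add_argument('--epsilon', type=float, default=0.1)
    parser.add_argument('--ent_coef', type=float, default=0.01)
    main(parser.parse_args())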