Example #1
def train(env_id, num_frames, seed, num_cpu):
    num_timesteps = int(num_frames / 4 * 1.1)

    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            env = gym.wrappers.Monitor(
                env,
                directory='/home/vasu/Desktop/acktr_json',
                force=True,
                video_callable=False,
                write_upon_reset=True)
            env.seed(seed + rank)
            if logger.get_dir():
                env = bench.Monitor(
                    env,
                    os.path.join(logger.get_dir(),
                                 "{}.monitor.json".format(rank)))
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env)

        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    policy_fn = CnnPolicy
    learn(policy_fn, env, seed, total_timesteps=num_timesteps, nprocs=num_cpu)
    env.close()
Example #2
def train(env_id, num_frames, seed, num_cpu, save_interval, ckpt_dir):
    num_timesteps = int(num_frames / 4 * 1.1)

    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            # check to ensure full action space is used
            assert env.action_space.n == 18, \
                "action space has {} actions, expected the full action space of 18".format(env.action_space.n)
            env.seed(seed + rank)
            if logger.get_dir():
                env = bench.Monitor(
                    env,
                    os.path.join(logger.get_dir(),
                                 "{}.monitor.json".format(rank)))
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env)

        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    policy_fn = CnnPolicy
    learn(policy_fn,
          env,
          seed,
          total_timesteps=num_timesteps,
          nprocs=num_cpu,
          save_interval=save_interval,
          ckpt_dir=ckpt_dir)
    env.close()
Example #3
def train(env_id, num_frames, seed, load_path, num_cpu):
    num_timesteps = int(num_frames // 4)

    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            env.seed(seed + rank)
            if logger.get_dir():
                env = bench.Monitor(
                    env,
                    os.path.join(logger.get_dir(),
                                 "{}.monitor.json".format(rank)))
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env)

        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    policy_fn = CnnPolicy
    learn(policy_fn,
          env,
          seed,
          total_timesteps=num_timesteps,
          load_path=load_path,
          nprocs=num_cpu)
    env.close()
Example #4
def train(num_timesteps, seed, num_cpu):
    # TODO: this is ugly; handle it more cleanly
    def make_env(rank):
        def _thunk():
            print(rank)
            if num_cpu == 1:
                env = MarioEnv(num_steering_dir=11)
            else:
                env = MarioEnv(num_steering_dir=11, num_env=rank)
            env.seed(seed + rank)
            env = bench.Monitor(
                env,
                logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            gym.logger.setLevel(logging.WARN)
            return env

        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    policy_fn = OurAcktrPolicy
    learn(policy_fn,
          env,
          seed,
          nsteps=4,
          total_timesteps=int(num_timesteps * 1.1),
          nprocs=num_cpu,
          save_interval=10)
    env.close()
Example #5
def train(env_id, num_timesteps, seed, num_cpu):
    env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4)
    policy_fn = CnnPolicy
    learn(policy_fn,
          env,
          seed,
          total_timesteps=int(num_timesteps * 1.1),
          nprocs=num_cpu)
    env.close()
Example #6
def train(env_id, num_timesteps, seed, num_cpu):
    """
    train an ACKTR model on atari

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    :param num_cpu: (int) The number of cpu to train on
    """
    env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4)
    policy_fn = partial(CnnPolicy, one_dim_bias=True)
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), nprocs=num_cpu)
    env.close()
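
For reference, a minimal and purely hypothetical invocation of the function above; the environment id and argument values are illustrative placeholders, not taken from any of these examples.

if __name__ == '__main__':
    # Hypothetical entry point; the environment id and hyperparameter values
    # below are placeholders, not defaults from the original example.
    train(env_id='BreakoutNoFrameskip-v4',
          num_timesteps=int(40e6),
          seed=0,
          num_cpu=16)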
Example #7
def train(env_id, policy_fn, num_timesteps, seed, num_cpu):
    def make_env(rank):
        def _thunk():
            env = make_atari(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env)
        return _thunk
    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), nprocs=num_cpu)
    env.close()
Example #8
def train(env_id, num_timesteps, seed, num_cpu, num_env):
    env = VecFrameStack(
        # make_atari_env(env_id, num_cpu, seed),
        make_distributed_env(env_id, num_env, seed),
        # make_old_dist_env(env_id, num_env, seed),
        4)
    policy_fn = partial(CnnPolicy, one_dim_bias=True)
    learn(policy_fn,
          env,
          seed,
          total_timesteps=int(num_timesteps * 1.1),
          nprocs=num_cpu)
    env.close()
Example #9
def train(env_id, num_frames, seed, num_cpu):
    num_timesteps = int(num_frames / 4 * 1.1) 
    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            env.seed(seed + rank)
            if logger.get_dir():
                env = bench.Monitor(env, os.path.join(logger.get_dir(), "{}.monitor.json".format(rank)))
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env)
        return _thunk
    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    policy_fn = CnnPolicy
    learn(policy_fn, env, seed, total_timesteps=num_timesteps, nprocs=num_cpu)
    env.close()
Example #10
def train(env_id, num_frames, seed, num_cpu):
    num_timesteps = int(num_frames / 4 * 1.1) 
    def make_env(rank):
        def _thunk():
            env_spec = gym.spec('ppaquette/DoomBasic-v0')
            env_spec.id = 'DoomBasic-v0'
            env = env_spec.make()
            env.seed(seed + rank)
            env = PreprocessImage((SkipWrapper(4)(ToDiscrete("minimal")(env))))
            if logger.get_dir():
                env = bench.Monitor(env, os.path.join(logger.get_dir(), "{}.monitor.json".format(rank)))
            gym.logger.setLevel(logging.WARN)
            return ScaleRewardEnv(env)
        return _thunk
    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    policy_fn = CnnPolicy
    learn(policy_fn, env, seed, total_timesteps=num_timesteps, nprocs=num_cpu, nstack=1)
    env.close()
Example #11
def train(params):
    policy_fn = CnnPolicy

    dataflow_config = {
        'future_rewards': True,             # Should we return future discounted rewards?
        'exclude_zero_actions': False,      # Should we exclude zero actions?
        'remap_actions': False,             # Should we remap to a smaller action set?
        'clip_rewards': True,               # Clip rewards to [-1, 1]
        'monte-specific-blackout': True,    # Cover up the score and lives indicators
        'pong-specific-blackout': False,    # Cover up the scores in Pong
        'gamma': params.gamma,              # Reward discount factor
        'frame_history': 4,                 # Minimum number of expert frames since the beginning of the episode
        'frameskip': 4,                     # Frameskip
        'preload_images': True,             # Preload images from disk instead of reloading them each time?
        'gdrive_data_id': cnst.MONTE_DATA_GDRIVE_ID,
        'data_dir': cnst.DATA_DIR,
        'img_dir': cnst.MIKE_IMG_DIR,
        'traj_dir': cnst.MIKE_TRAJECTORIES_DIR,
        'stat_dir': cnst.MIKE_STATES_DIR,
        'batch_size': params.expert_nbatch,
        'max_score_cutoff': params.exp_max_score,  # Maximum expert score to show; used to cut the expert data
        'min_score_cutoff': 20000,                 # Minimum score for a trajectory to count as expert
        'process_lost_lifes': True,                # Should a lost life zero out the future discounted reward?
        'use_n_trajectories': params.use_n_trajectories if 'use_n_trajectories' in params else None
    }

    the_seed = np.random.randint(10000)
    print(80 * "SEED")
    print("Today's lucky seed is {}".format(the_seed))
    print(80 * "SEED")

    env = VecFrameStack(
        make_atari_env(
            env_id=params.env,
            num_env=params.num_env,
            seed=the_seed,
            limit_len=params.limit_len,
            limit_penalty=params.limit_penalty,
            death_penalty=params.death_penalty,
            step_penalty=params.step_penalty,
            random_state_reset=params.random_state_reset,
            dataflow_config=dataflow_config
        ),
        params.frame_stack
    )

    learn(
        policy=policy_fn,
        env=env,
        seed=the_seed,
        params=params,
        dataflow_config=dataflow_config,
        expert_nbatch=params.expert_nbatch,
        exp_adv_est=params.exp_adv_est,
        load_model=params.load_model,
        gamma=params.gamma,
        nprocs=params.num_env,
        nsteps=params.nsteps,
        ent_coef=params.ent_coef,
        expert_coeff=params.exp_coeff,
        lr=params.lr,
        lrschedule=params.lrschedule,
    )

    env.close()
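
Since this variant reads many attributes from a single params object, the following hypothetical sketch lists the fields that train() accesses above; the attribute names come from the code, but every value is a placeholder (the real project presumably builds params from its own argument parser).

from argparse import Namespace

# Hypothetical params object; the attribute names mirror what train() reads
# above, while every value is a placeholder rather than a project default.
example_params = Namespace(
    env='MontezumaRevengeNoFrameskip-v4',  # placeholder environment id
    num_env=16,
    frame_stack=4,
    gamma=0.99,
    expert_nbatch=128,
    exp_max_score=None,
    use_n_trajectories=None,
    limit_len=None,
    limit_penalty=0.0,
    death_penalty=0.0,
    step_penalty=0.0,
    random_state_reset=False,
    exp_adv_est='reward',
    load_model=None,
    nsteps=20,
    ent_coef=0.01,
    exp_coeff=1.0,
    lr=0.25,
    lrschedule='linear',
)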
Example #12
def train(env_id, num_timesteps, seed, num_cpu):
    env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4)
    policy_fn = CnnPolicy
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), nprocs=num_cpu)
    env.close()