def train(env_id,
          num_env,
          num_timesteps,
          seed,
          policy,
          algo='regular',
          ib_alpha=1e-3):
    # ncpu = multiprocessing.cpu_count()
    # if sys.platform == 'darwin': ncpu //= 2
    # config = tf.ConfigProto(allow_soft_placement=True,
    #                         intra_op_parallelism_threads=ncpu,
    #                         inter_op_parallelism_threads=ncpu)
    # config.gpu_options.allow_growth = True #pylint: disable=E1101
    # tf.Session(config=config).__enter__()

    env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)
    test_env = VecFrameStack(make_atari_env(env_id, num_env, seed + 1), 4)
    policy = {'cnn_svib': CnnPolicySVIB}[policy]
    reward_list = learn(policy=policy,
                        env=env,
                        seed=seed,
                        test_env=test_env,
                        nsteps=32,
                        nminibatches=4,
                        lam=0.95,
                        gamma=0.99,
                        noptepochs=4,
                        log_interval=10,
                        ent_coef=.01,
                        lr=lambda f: f * 2.5e-4,
                        cliprange=lambda f: f * 0.1,
                        total_timesteps=int(num_timesteps * 1.),
                        algo=algo,
                        ib_alpha=ib_alpha)
    return reward_list
Example #2
def make_vec_envs(env_name,
                  seed,
                  num_processes,
                  gamma,
                  log_dir,
                  add_timestep,
                  allow_early_resets,
                  num_frame_stack=None):
    if isinstance(env_name, dict):
        if num_frame_stack is None and 'num_frame_stack' in env_name:
            num_frame_stack = env_name.get('num_frame_stack')

    envs = [
        make_env(env_name, seed, i, log_dir, add_timestep, allow_early_resets)
        for i in range(num_processes)
    ]

    if len(envs) > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        if gamma is None:
            envs = VecNormalize(envs, ret=False)
        else:
            envs = VecNormalize(envs, gamma=gamma)

    if num_frame_stack is not None:
        if num_frame_stack != 1:
            envs = VecFrameStack(envs, num_frame_stack)
    elif len(envs.observation_space.shape) == 3:
        envs = VecFrameStack(envs, 4)

    return envs
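A minimal, hypothetical call to make_vec_envs above; the environment id, log directory, and argument values are illustrative and not taken from the source:

# Illustrative values only; adjust to your own setup.
envs = make_vec_envs('PongNoFrameskip-v4',   # 3-D Atari observations -> frames get stacked
                     seed=0,
                     num_processes=8,        # more than one process -> SubprocVecEnv
                     gamma=0.99,             # only used by VecNormalize on 1-D observation spaces
                     log_dir='/tmp/agent_logs',
                     add_timestep=False,
                     allow_early_resets=True)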
Example #3
def train(env_id, num_timesteps, seed, policy, lrschedule, num_env,load_path,
          algo='regular', beta=1e-3):
    if policy == 'cnn_svib':
        policy_fn = CnnPolicy
    else:
        policy_fn = CnnPolicy
    env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)
    test_env = VecFrameStack(make_atari_env(env_id, num_env, seed+1), 4)
    reward_list = learn(policy_fn, env, test_env, seed, total_timesteps=int(num_timesteps),
                        lrschedule=lrschedule, load_path=load_path, algo=algo, beta=beta)
    # env.close()
    return reward_list
Example #4
def train(env_id, clipped_type, num_timesteps, seed, args, policy):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin': ncpu //= 2
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True
    tf.Session(config=config).__enter__()

    env = VecFrameStack(make_atari_env(env_id, num_env=8, seed=seed),
                        4)  # TODO: note that this runs 8 worker processes
    policy = {'cnn': CnnPolicy, 'mlp': MlpPolicy}[policy]
    ent_coef = 0.01 if args.clipped_type == 'origin' else 0
    ppo2.learn(
        policy=policy,
        env=env,
        nsteps=128,
        nminibatches=4,
        lam=0.95,
        gamma=0.99,
        noptepochs=4,
        log_interval=1,
        ent_coef=ent_coef,
        lr=lambda f: f * 2.5e-4,
        total_timesteps=int(num_timesteps * 1.1),
        clipped_type=clipped_type,
        args=args,
        save_interval=200,
    )
Example #5
def train(num_timesteps,
          env_name,
          seed,
          policy,
          lrschedule,
          num_env,
          entrophy,
          lr,
          save_name=None):
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    elif policy == 'i2a':
        policy_fn = I2ANetwork
    env = VecFrameStack(make_doom_env(num_env, 0, env_name), 4)
    if save_name is None:
        save_name = env_name
    learn(policy_fn,
          env,
          seed,
          save_name=save_name,
          total_timesteps=int(num_timesteps * 1.1),
          lrschedule=lrschedule,
          log_interval=500,
          save_interval=1000,
          cont=True,
          ent_coef=entrophy,
          lr=lr)
    env.close()
Example #6
def train(env_id, num_timesteps, seed, policy, lrschedule, num_env, ckpt_path,
          hparams):
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    elif policy == 'cnn_attention':
        policy_fn = CnnAttentionPolicy

    video_log_dir = os.path.join(hparams['base_dir'], 'videos',
                                 hparams['experiment_name'])
    env = VecFrameStack(
        make_atari_env(env_id,
                       num_env,
                       seed,
                       video_log_dir=video_log_dir,
                       write_attention_video='attention' in policy,
                       hparams=hparams), 4)

    learn(policy_fn,
          env,
          seed,
          total_timesteps=int(num_timesteps * 1.1),
          lrschedule=lrschedule,
          ckpt_path=ckpt_path,
          hparams=hparams)
    env.close()
Example #7
def train(env_id, num_timesteps, seed, policy, lr_schedule, num_env):
    """
    Train A2C model for atari environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    :param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...)
    :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant',
                                 'double_linear_con', 'middle_drop' or 'double_middle_drop')
    :param num_env: (int) The number of environments
    """
    policy_fn = None
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    if policy_fn is None:
        raise ValueError("Error: policy {} not implemented".format(policy))

    env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), lr_schedule=lr_schedule)
    env.close()
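A hypothetical invocation of the A2C helper above; the env id and values are illustrative and follow the policies and schedules listed in the docstring:

# Illustrative only: CNN-based A2C on Breakout with 8 parallel environments.
train(env_id='BreakoutNoFrameskip-v4',
      num_timesteps=int(1e6),
      seed=0,
      policy='cnn',            # selects CnnPolicy
      lr_schedule='constant',  # any of the schedules named in the docstring
      num_env=8)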
Example #8
def build_env(args):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin': ncpu //= 2
    nenv = args.num_env or ncpu
    alg = args.alg
    rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
    seed = args.seed

    env_type, env_id = get_env_type(args.env)

    if env_type == 'atari':
        if alg == 'acer':
            env = make_vec_env(env_id, env_type, nenv, seed)
        elif alg == 'deepq':
            env = atari_wrappers.make_atari(env_id)
            env.seed(seed)
            env = bench.Monitor(env, logger.get_dir())
            env = atari_wrappers.wrap_deepmind(env, frame_stack=True)
        elif alg == 'trpo_mpi':
            env = atari_wrappers.make_atari(env_id)
            env.seed(seed)
            env = bench.Monitor(
                env,
                logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
            env = atari_wrappers.wrap_deepmind(env)
            # TODO check if the second seeding is necessary, and eventually remove
            env.seed(seed)
        else:
            frame_stack_size = 4
            env = VecFrameStack(make_vec_env(env_id, env_type, nenv, seed),
                                frame_stack_size)

    elif env_type == 'retro':
        import retro
        gamestate = args.gamestate or retro.State.DEFAULT
        env = retro_wrappers.make_retro(
            game=args.env,
            state=gamestate,
            max_episode_steps=10000,
            use_restricted_actions=retro.Actions.DISCRETE)
        env.seed(args.seed)
        env = bench.Monitor(env, logger.get_dir())
        env = retro_wrappers.wrap_deepmind_retro(env)

    else:
        get_session(
            tf.ConfigProto(allow_soft_placement=True,
                           intra_op_parallelism_threads=1,
                           inter_op_parallelism_threads=1))

        env = make_vec_env(env_id,
                           env_type,
                           args.num_env or 1,
                           seed,
                           reward_scale=args.reward_scale)

        if env_type == 'mujoco':
            env = VecNormalize(env)

    return env
Example #9
def train(env_id, num_timesteps, seed, policy):
    """
    Train PPO2 model for atari environment, for testing purposes

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    """

    env = VecFrameStack(make_atari_env(env_id, 8, seed), 4)
    policy = {
        'cnn': CnnPolicy,
        'lstm': LstmPolicy,
        'lnlstm': LnLstmPolicy,
        'mlp': MlpPolicy
    }[policy]
    ppo2.learn(policy=policy,
               env=env,
               n_steps=128,
               nminibatches=4,
               lam=0.95,
               gamma=0.99,
               noptepochs=4,
               log_interval=1,
               ent_coef=.01,
               learning_rate=lambda f: f * 2.5e-4,
               cliprange=lambda f: f * 0.1,
               total_timesteps=int(num_timesteps * 1.1))
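Likewise, a hypothetical call to the PPO2 variant above (argument values are illustrative; the number of environments is hard-coded to 8 inside the function):

# Illustrative only: PPO2 on Pong with the LSTM policy.
train(env_id='PongNoFrameskip-v4',
      num_timesteps=int(2e6),
      seed=0,
      policy='lstm')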
Example #10
def build_env(args, extra_args):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin': ncpu //= 2
    nenv = args.num_env or ncpu
    alg = args.alg
    rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
    seed = args.seed

    env_type, env_id = get_env_type(args.env)

    if env_type == 'atari':
        if alg == 'acer':
            env = make_vec_env(env_id, env_type, nenv, seed)
        elif alg == 'deepq':
            env = atari_wrappers.make_atari(env_id, None)
            env.seed(seed)
            env = bench.Monitor(env, logger.get_dir())
            env = atari_wrappers.wrap_deepmind(env, frame_stack=True)
        elif alg == 'trpo_mpi':
            env = atari_wrappers.make_atari(env_id, None)
            env.seed(seed)
            env = bench.Monitor(
                env,
                logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
            env = atari_wrappers.wrap_deepmind(env)
            # TODO check if the second seeding is necessary, and eventually remove
            env.seed(seed)
        else:
            frame_stack_size = 4
            weights = extra_args['weights'] if 'weights' in extra_args else None
            env = VecFrameStack(
                make_vec_env(env_id, env_type, nenv, seed, weights=weights),
                frame_stack_size)
    return env
Example #11
def build_env(args):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin': ncpu //= 2
    nenv = args.num_env or ncpu
    alg = args.alg
    seed = args.seed

    env_type, env_id = get_env_type(args.env)

    if env_type in {'atari', 'retro'}:
        if alg == 'deepq':
            env = make_env(env_id, env_type, seed=seed, wrapper_kwargs={'frame_stack': True})
        elif alg == 'trpo_mpi':
            env = make_env(env_id, env_type, seed=seed)
        else:
            frame_stack_size = 4
            env = make_vec_env(env_id, env_type, nenv, seed, gamestate=args.gamestate, reward_scale=args.reward_scale)
            env = VecFrameStack(env, frame_stack_size)

    else:
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=1,
                                inter_op_parallelism_threads=1,
                                gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.20))
        config.gpu_options.allow_growth = True
        get_session(config=config)

        flatten_dict_observations = alg not in {'her'}
        env = make_vec_env(env_id, env_type, args.num_env or 1, seed,
                           reward_scale=args.reward_scale,
                           flatten_dict_observations=flatten_dict_observations)

        normalize_value = args.normalize_value
        if (env_type == 'mujoco' or env_type == 'roboschool') and normalize_value:
            env = VecNormalize(env)

    return env
Example #12
    def setup_eval_env(self, env_name, seed):
        if env_name == "spaceinvaders":
            env_id = "SpaceInvadersNoFrameskip-v4"
        elif env_name == "mspacman":
            env_id = "MsPacmanNoFrameskip-v4"
        elif env_name == "videopinball":
            env_id = "VideoPinballNoFrameskip-v4"
        elif env_name == "beamrider":
            env_id = "BeamRiderNoFrameskip-v4"
        else:
            env_id = env_name[0].upper() + env_name[1:] + "NoFrameskip-v4"
        env_type = "atari"
        #env id, env type, num envs, and seed
        env = make_vec_env(env_id,
                           env_type,
                           1,
                           seed,
                           wrapper_kwargs={
                               'clip_rewards': False,
                               'episode_life': False,
                           })
        if env_type == 'atari':
            env = VecFrameStack(env, 4)

        print("env actions", env.action_space)
        return env
Example #13
def train(env_id, num_timesteps, seed, policy):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin': ncpu //= 2
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True #pylint: disable=E1101
    gym.logger.setLevel(logging.WARN)
    tf.Session(config=config).__enter__()
    nenvs = 8
    def make_env(rank):
        def env_fn():
            print(rank)
            if nenvs == 1:
                env = MarioEnv(num_steering_dir=11, jump=True)
            else:
                env = MarioEnv(num_steering_dir=11, num_env=rank, jump=True)
            env.seed(seed + rank)
            env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
            gym.logger.setLevel(logging.WARN)
            return env
        return env_fn

    env = SubprocVecEnv([make_env(i) for i in range(nenvs)])
    set_global_seeds(seed)
    env = VecFrameStack(env, 4)
    policy = {'cont': ContCnnPolicy, 'cnn' : OurCNN2, 'lstm' : LstmPolicy, 'lnlstm' : LnLstmPolicy}[policy]
    ppo2.learn(policy=policy, env=env, nsteps=128, nminibatches=4,
        lam=0.95, gamma=0.99, noptepochs=4, log_interval=1,
        ent_coef=.01,
        lr=lambda f : f * 1e-3,
        cliprange=lambda f : f * 0.1,
        total_timesteps=int(num_timesteps * 1.1),
               save_interval=10)
Example #14
def train(env_id, num_timesteps, seed, policy):

    ncpu = multiprocessing.cpu_count()

    if sys.platform == 'darwin': ncpu //= 2
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    tf.Session(config=config).__enter__()

    env = VecFrameStack(make_atari_env(env_id, 8, seed), 4)
    policy = {
        'cnn': CnnPolicy,
        'lstm': LstmPolicy,
        'lnlstm': LnLstmPolicy,
        'mlp': MlpPolicy
    }[policy]
    ppo2.learn(policy=policy,
               env=env,
               nsteps=128,
               nminibatches=4,
               lam=0.95,
               gamma=0.99,
               noptepochs=4,
               log_interval=1,
               ent_coef=.01,
               lr=lambda f: f * 2.5e-4,
               cliprange=lambda f: f * 0.1,
               total_timesteps=int(num_timesteps * 1.1))
Example #15
def build_env(args):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin': ncpu //= 2
    nenv = args.num_env or ncpu
    alg = args.alg
    seed = args.seed

    env_type, env_id = get_env_type(args.env)

    if env_type in {'atari', 'retro'}:
        if alg == 'deepq':
            env = make_env(env_id, env_type, seed=seed, wrapper_kwargs={'frame_stack': True})
        elif alg == 'trpo_mpi':
            env = make_env(env_id, env_type, seed=seed)
        else:
            frame_stack_size = 4
            env = make_vec_env(env_id, env_type, nenv, seed, gamestate=args.gamestate, reward_scale=args.reward_scale)
            env = VecFrameStack(env, frame_stack_size)

    else:
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=1,
                                inter_op_parallelism_threads=1)
        config.gpu_options.allow_growth = True
        get_session(config=config)

        env = make_vec_env(env_id, env_type, args.num_env or 1, seed, reward_scale=args.reward_scale)

        if env_type == 'mujoco':
            env = VecNormalize(env)

    return env
Example #16
def learn(env_path, seed, max_steps, reward_range, base_port, unity_arguments,
          summary_writer):
    env = VecFrameStack(_make_a2c(env_path,
                                  num_env=8,
                                  seed=seed,
                                  reward_range=reward_range,
                                  base_port=base_port,
                                  unity_arguments=unity_arguments),
                        nstack=4)

    model = learn_a2c(
        policy=CnnPolicy,
        env=env,
        seed=seed,
        ent_coef=0.01,
        nsteps=5,
        total_timesteps=max_steps,
        callback=_create_summary_callback(summary_writer=summary_writer))

    try:
        env.close()
    except Exception as e:
        print("Failed to close environment: " + str(e))

    return model
Example #17
def build_env(args, silent_monitor, prio_args=None):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin': ncpu //= 2
    nenv = args.num_env or ncpu
    alg = args.alg
    seed = args.seed

    env_type, env_id = get_env_type(args.env)

    if env_type in {'atari', 'retro'}:
        if alg == 'deepq':
            env = make_env(env_id,
                           env_type,
                           seed=seed,
                           wrapper_kwargs={'frame_stack': True})
        elif alg == 'trpo_mpi':
            env = make_env(env_id, env_type, seed=seed)
        else:
            frame_stack_size = 4
            env = make_vec_env(env_id,
                               env_type,
                               nenv,
                               seed,
                               gamestate=args.gamestate,
                               reward_scale=args.reward_scale,
                               prio_args=prio_args,
                               silent_monitor=silent_monitor)
            if prio_args is None:
                env = VecFrameStack(env, frame_stack_size)
            else:
                env = PrioVecFrameStack(env, frame_stack_size)

            # TODO prio vec frame stack

    else:
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=1,
                                inter_op_parallelism_threads=1)
        config.gpu_options.allow_growth = True
        get_session(config=config)

        num_env = args.n_active_envs if prio_args is None else args.num_env
        flatten_dict_observations = alg not in {'her'}
        env = make_vec_env(env_id,
                           env_type,
                           num_env or 1,
                           seed,
                           reward_scale=args.reward_scale,
                           flatten_dict_observations=flatten_dict_observations,
                           prio_args=prio_args,
                           silent_monitor=silent_monitor)

        if env_type == 'mujoco':
            if prio_args is None:
                env = VecNormalize(env)
            else:
                env = PrioVecNormalize(env)

    return env
Example #18
def train(env_id, num_timesteps, seed, policy, lrschedule, num_env,load_path,
          algo='use_svib_uniform', ib_alpha=1e-3):
    if policy == 'cnn_svib':
        policy_fn = CnnPolicySVIB
    else:
        policy_fn = CnnPolicySVIB
    if 'NoFrameskip' in env_id:
        env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)
        test_env = VecFrameStack(make_atari_env(env_id, num_env, seed+1), 4)
    else:
        env = VecFrameStack(make_atari_env_low_dim(env_id, num_env, seed), 4)
        test_env = VecFrameStack(make_atari_env_low_dim(env_id, num_env, seed+1), 4)
        # train_mine_env = VecFrameStack(make_atari_env_low_dim(env_id, num_env, seed), 4)
    reward_list, value_list = learn(policy_fn, env, test_env, seed, total_timesteps=int(num_timesteps),
                        lrschedule=lrschedule, load_path=load_path, algo=algo, ib_alpha=ib_alpha)
    env.close()
    return reward_list, value_list
Example #19
def build_pend_env(args, **kwargs):
    alg = args.alg
    seed = args.seed

    flatten_dict_observations = alg not in {'her'}
    env = make_vec_env(args.env, 'atari', args.num_env or 1, seed, reward_scale=args.reward_scale,
                       flatten_dict_observations=flatten_dict_observations)
    # 'k' (the frame-stack depth) is undefined in the original snippet; assume it
    # arrives via **kwargs, defaulting to the usual 4-frame stack.
    k = kwargs.get('k', 4)
    return VecFrameStack(env, k, norm_frac=255)
Example #20
def train(env_id, learning_rate, max_learning_rate, num_epoch, buffer_size,
          batch_size, num_timesteps, num_workers, seed, policy, load_path,
          frame_skip):

    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin':
        ncpu //= 2
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    # nenv = num_workers or ncpu if not render else 1
    # alg = policy
    # rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
    env = VecFrameStack(
        make_neyboy_env(env_id, num_workers, seed, frame_skip=frame_skip), 4)

    nsteps = buffer_size // num_workers
    nminibatches = buffer_size // batch_size

    logger.info('buffer_size={}'.format(buffer_size))
    logger.info('batch_size={}'.format(batch_size))
    logger.info('num-workers={}'.format(num_workers))
    logger.info('nsteps={}'.format(nsteps))
    logger.info('nminibatches={}'.format(nminibatches))
    logger.info('noptepochs={}'.format(num_epoch))
    logger.info('lr={}'.format(learning_rate))
    logger.info('max_lr={}'.format(max_learning_rate))
    logger.info('load_path={}'.format(load_path))
    logger.info('frame_skip={}'.format(frame_skip))

    total_timesteps = int(num_timesteps * 1.1)

    def lr_fn(frac, iteration):
        num_iterations = nminibatches
        stepsize = num_iterations // 2
        base_lr = frac * learning_rate
        max_lr = frac * max_learning_rate
        cycle = np.floor(1 + iteration / (2 * stepsize))
        x = np.abs(iteration / stepsize - 2 * cycle + 1)
        lr = base_lr + (max_lr - base_lr) * np.maximum(0, (1 - x))
        return lr

    ppo2.learn(network=policy,
               env=env,
               nsteps=nsteps,
               nminibatches=nminibatches,
               lam=0.95,
               gamma=0.99,
               noptepochs=num_epoch,
               log_interval=1,
               ent_coef=0.0,
               lr=lr_fn,
               cliprange=lambda f: f * 0.1,
               total_timesteps=total_timesteps,
               save_interval=10,
               load_path=load_path)
Example #21
def learn(env_id, seed, num_timesteps, batch_size, buffer_size, ent_coef, lr,
          rollout_steps, train_steps, log_every, eval_num):
    # Seed
    set_global_seeds(seed)
    num_env = 4
    num_worker = 8

    env = VecFrameStack(make_atari_env(env_id, 1, seed), 4)
    ob_space = env.observation_space
    ac_space = env.action_space

    cluster = tf.train.ClusterSpec({
        'actor': ['localhost:%d' % (10001 + i) for i in range(num_env)],
        'ps': ['localhost:10000'],
        'worker': ['localhost:%d' % (11001 + i) for i in range(num_worker)]
    })

    # Runner
    input_queue = Queue()
    output_queue = Queue()
    runners = []
    for i in range(num_env):
        runners.append(
            Runner(env_id,
                   seed + i,
                   ob_space,
                   ac_space,
                   output_queue,
                   task_index=i,
                   cluster=cluster))
        runners[i].start()

    # Data Helper
    data_helper = DataHelper(int(buffer_size), input_queue, output_queue,
                             batch_size, rollout_steps, train_steps)
    data_helper.start()

    # Workers
    workers = []
    for i in range(num_worker):
        workers.append(Worker(cluster, input_queue, i, ob_space, ac_space))
        workers[i].start()

    # Model
    server = tf.train.Server(cluster,
                             job_name='ps',
                             task_index=0,
                             config=tf.ConfigProto(device_filters=["/job:ps"]))
    shared_job_device = '/job:ps/task:0'
    sess = tf.Session(server.target)
    with sess.as_default():
        with tf.device(shared_job_device):
            model = Model(sess, ob_space, ac_space, batch_size, lr, ent_coef)
        sess.run(tf.global_variables_initializer())

        for _ in range(100000):
            time.sleep(1)
Example #22
def build_env(args):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin': ncpu //= 2
    nenv = args.num_env or ncpu
    alg = args.alg
    seed = args.seed

    env_type, env_id = get_env_type(args.env)

    print(env_id)
    #extract the agc_env_name
    noskip_idx = env_id.find("NoFrameskip")
    env_name = env_id[:noskip_idx].lower()
    print("Env Name for Masking:", env_name)

    if env_type in {'atari', 'retro'}:
        if alg == 'deepq':
            env = make_env(env_id, env_type, seed=seed, wrapper_kwargs={'frame_stack': True})
        elif alg == 'trpo_mpi':
            env = make_env(env_id, env_type, seed=seed)
        else:
            frame_stack_size = 4
            env = make_vec_env(env_id, env_type, nenv, seed, gamestate=args.gamestate, reward_scale=args.reward_scale)
            env = VecFrameStack(env, frame_stack_size)

    else:
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=1,
                                inter_op_parallelism_threads=1)
        config.gpu_options.allow_growth = True
        get_session(config=config)

        env = make_vec_env(env_id, env_type, args.num_env or 1, seed, reward_scale=args.reward_scale)

    if args.custom_reward != '':
        from baselines.common.vec_env import VecEnv, VecEnvWrapper
        import baselines.common.custom_reward_wrapper as W
        assert isinstance(env,VecEnv) or isinstance(env,VecEnvWrapper)

        custom_reward_kwargs = eval(args.custom_reward_kwargs)

        if args.custom_reward == 'pytorch':
            if args.custom_reward_path == '':
                assert False, 'no path for reward model'
            else:
                env = W.VecPyTorchAtariReward(env, args.custom_reward_path, env_name)
        else:
            assert False, 'no such wrapper exist'

    if env_type == 'mujoco':
        env = VecNormalize(env)
    # if env_type == 'atari':
    #     input("Normalizing for ATari game: okay? [Enter]")
    #     #normalize rewards but not observations for atari
    #     env = VecNormalizeRewards(env)

    return env
Example #23
def train(env_id, num_timesteps, seed, num_cpu):
    env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4)
    policy_fn = CnnPolicy
    learn(policy_fn,
          env,
          seed,
          total_timesteps=int(num_timesteps * 1.1),
          nprocs=num_cpu)
    env.close()
Example #24
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    #env = gym.make(args.env_id)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.CNNPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2)
    #env = bench.Monitor(env, logger.get_dir() and
    #                    osp.join(logger.get_dir(), "monitor.json"))

    env = make_vec_env(args.env_id, 'atari', 1, args.seed,
                       wrapper_kwargs={
                           'clip_rewards':False,
                           'episode_life':False,
                       })
    env = VecFrameStack(env, 4)

    #env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    if args.task == 'train':
        dataset = LMDB_Dset(expert_path=args.expert_path, traj_limitation=args.traj_limitation)
        reward_giver = TransitionClassifier(env, args.adversary_hidden_size, entcoeff=args.adversary_entcoeff)
        train(env,
              args.seed,
              policy_fn,
              reward_giver,
              dataset,
              args.algo,
              args.g_step,
              args.d_step,
              args.policy_entcoeff,
              args.num_timesteps,
              args.save_per_iter,
              args.checkpoint_dir,
              args.log_dir,
              args.pretrained,
              args.BC_max_iter,
              task_name
              )
    elif args.task == 'evaluate':
        runner(env,
               policy_fn,
               args.load_model_path,
               timesteps_per_batch=1024,
               number_trajs=10,
               stochastic_policy=args.stochastic_policy,
               save=args.save_sample
               )
    else:
        raise NotImplementedError
    env.close()
Example #25
def train(env_id, num_timesteps, seed, policy):
    from baselines.common import set_global_seeds
    from baselines.common.atari_wrappers import make_atari, wrap_deepmind
    from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
    from baselines.common.vec_env.vec_frame_stack import VecFrameStack
    from baselines.ppo2 import ppo2
    from baselines.ppo2.policies import CnnPolicy, LstmPolicy, LnLstmPolicy, MlpPolicy
    import gym
    import logging
    import multiprocessing
    import os.path as osp
    import tensorflow as tf
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin':
        ncpu //= 2
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    gym.logger.setLevel(logging.WARN)
    tf.Session(config=config).__enter__()

    def make_env(rank):
        def env_fn():
            env = make_atari(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(
                env,
                logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
            return env  # wrap_deepmind(env)

        return env_fn

    nenvs = 8
    env = SubprocVecEnv([make_env(i) for i in range(nenvs)])
    set_global_seeds(seed)
    env = VecFrameStack(env, 4)
    policy = {
        'cnn': CnnPolicy,
        'lstm': LstmPolicy,
        'lnlstm': LnLstmPolicy,
        'mlp': MlpPolicy,
        'capsules': CapsulesPolicy
    }[policy]
    ppo2.learn(policy=policy,
               env=env,
               nsteps=128,
               nminibatches=4,
               lam=0.95,
               gamma=0.99,
               noptepochs=4,
               log_interval=1,
               ent_coef=.01,
               lr=lambda f: f * 2.5e-4,
               cliprange=lambda f: f * 0.1,
               total_timesteps=int(num_timesteps * 1.1))
Example #26
def train(env_id, num_timesteps, seed, policy, lrschedule, num_env):
    if policy == u'cnn':
        policy_fn = CnnPolicy
    elif policy == u'lstm':
        policy_fn = LstmPolicy
    elif policy == u'lnlstm':
        policy_fn = LnLstmPolicy
    env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), lrschedule=lrschedule)
    env.close()
Example #27
def wrap_env_ppo(env):
    env = ThresholdResizeFrame(env)
    # env = WarpFrame(env)
    env = ClipRewardEnv(env)
    # env = NoopResetEnv(env, noop_max=8)
    env = MaxAndSkipEnv(env, skip=4)
    env = Monitor(env, logger.get_dir())
    env = DummyVecEnv([lambda: env])
    env = VecFrameStack(env, 4)
    return env
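A sketch of how the wrapper chain above might be applied to a raw Gym environment; the env id is illustrative and the custom ThresholdResizeFrame wrapper is assumed to be importable:

import gym

# Illustrative only: observations are resized/thresholded, rewards clipped,
# frames max-pooled over a skip of 4, and the last 4 frames stacked.
raw_env = gym.make('PongNoFrameskip-v4')
vec_env = wrap_env_ppo(raw_env)
obs = vec_env.reset()  # batched: shape (1, height, width, 4 * channels)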
Example #28
def build_env(args, seed):
    nenv = 1
    alg = args.alg
    # seed = args.seed
    seed = int(np.random.rand(1) * 101000)
    print(seed)

    env_type, env_id = get_env_type(args.env)
    set_global_seeds(seed)
    if env_type in {'atari', 'retro'}:
        if alg == 'deepq':
            env = make_env(env_id,
                           env_type,
                           seed=seed,
                           wrapper_kwargs={'frame_stack': True})
        elif alg == 'trpo_mpi':
            env = make_env(env_id, env_type, seed=seed)
        else:
            frame_stack_size = 4
            env = make_vec_env(env_id,
                               env_type,
                               nenv,
                               seed,
                               gamestate=args.gamestate,
                               reward_scale=args.reward_scale)
            env = VecFrameStack(env, frame_stack_size)

    else:
        # config = tf.ConfigProto(allow_soft_placement=True,
        #                        intra_op_parallelism_threads=1,
        #                        inter_op_parallelism_threads=1)
        # config.gpu_options.allow_growth = True
        # get_session(config=config)
        sess = tf.InteractiveSession()
        # env = VecNormalize(make_vec_env(env_id, env_type, 1, seed, reward_scale=args.reward_scale))

        env = make_vec_env(env_id,
                           env_type,
                           args.numenv,
                           seed,
                           reward_scale=args.reward_scale)
        evalenv = make_vec_env(env_id,
                               env_type,
                               args.numenv,
                               seed,
                               reward_scale=args.reward_scale)

        # if env_type == 'mujoco':
        #     env = VecNormalize(env)
        #     evalenv = VecNormalizeEval(evalenv)
        #     evalenv.ob_rms = env.ob_rms
        #     evalenv.ret_rms = env.ret_rms

    return env, sess, evalenv
Example #29
def main():
    numOfTests = 40
    env_args = {
        'episode_life': False,
        'clip_rewards': False,
        'crop': True,
        'rotate': True
    }
    env = VecFrameStack(
        make_vec_env("gvgai-zelda-lvl0-v0",
                     numOfTests,
                     43,
                     wrapper_kwargs=env_args), 4)
    policy = build_policy(env, "cnn")
    model = Model(policy=policy, env=env, nsteps=5)
    model.load('logs/test_4*5_r1_right/checkpoints/260000')
    nh, nw, nc = env.observation_space.shape
    result = dict()
    for j in range(201, 601):
        # obs = np.zeros((numOfTests, nh, nw, nc), dtype=np.uint8)
        done = np.array([False] * numOfTests)
        env.venv.set_level(
            "GVGAI_GYM/gym_gvgai/envs/games/zelda_v0/zelda_lvl{}.txt".format(
                j))
        obs = env.reset()
        infos = [False] * numOfTests
        # dones = [False] * numOfTests

        while not all(done):
            actions, values, state, _ = model.step(obs)
            obs, rewards, dones, info = env.step(actions)
            done[np.where(dones != False)] = True
            for i in np.where(dones != False)[0].tolist():
                if not infos[i]:
                    # print(info)
                    del info[i]["grid"]
                    del info[i]["ascii"]
                    infos[i] = info[i]
            # print(np.where(dones!=False)[0])
            # print(done)
            # print(infos)

        # print(dones)
        win = [1 if (i['winner'] == 'PLAYER_WINS') else 0 for i in infos]
        # score = [i['episode']['r'] for i in infos]
        # steps = [i['episode']['l'] for i in infos]
        # time = [i['episode']['t'] for i in infos]
        print("level {}".format(j), win)
        result[j] = infos

    env.close()

    with open("result_4*5_r1_right_200~600", "wb") as f:
        pickle.dump(result, f)
Example #30
File: run.py  Project: XFFXFF/PPO
def create_env(env_id, n_env, seed):
    def make_env(rank):
        def _thunk():
            env = make_atari(env_id)
            env.seed(seed + rank)
            env = LogWrapper(env)
            return wrap_deepmind(env)
        return _thunk
    env = SubprocVecEnv([make_env(i) for i in range(n_env)])
    env = VecFrameStack(env, 4)
    return env
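Finally, a hypothetical use of create_env from run.py above (the env id and values are illustrative):

# Illustrative only: 8 parallel Atari workers with deepmind wrappers and 4-frame stacking.
env = create_env('BreakoutNoFrameskip-v4', n_env=8, seed=0)
obs = env.reset()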