Example #1
def common_arg_parser():
    parser = arg_parser()
    parser.add_argument('--env',
                        help='environment ID',
                        type=str,
                        default='Hopper-v1')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--alg', help='Algorithm', type=str, default='bgail')
    parser.add_argument('--reward_scale',
                        help='Reward scale factor. Default: 1.0',
                        type=float,
                        default=1.0)
    parser.add_argument('--save_path',
                        help='Path to save trained model to',
                        type=str,
                        default='./outputs')
    parser.add_argument('--load_path',
                        help='Path to load trained model for evaluation',
                        type=str,
                        default=None)
    parser.add_argument('--render',
                        help='Whether to display the simulation or not',
                        action='store_true',
                        default=False)

    return parser
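
None of these snippets define `arg_parser()` itself; they follow the OpenAI baselines convention, where it is a thin factory around `argparse.ArgumentParser`. A minimal sketch of the assumed helper:

import argparse

def arg_parser():
    # Empty parser; ArgumentDefaultsHelpFormatter makes --help show defaults.
    return argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)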
Example #2
def network_arg_parser():
    parser = arg_parser()
    parser.add_argument('--value_network',
                        '--value-network',
                        type=str,
                        default=None,
                        choices=[None, 'copy', 'shared'],
                        help='value network: a separate copy of or shared with the policy network')
    # NOTE: argparse's type=bool is misleading -- bool('False') is True,
    # so any non-empty value on the command line yields True (see the
    # str2bool sketch below this example for a safe converter).
    parser.add_argument('--normalize_observations',
                        '--normalize-observations',
                        type=bool,
                        default=False,
                        help='decide whether to normalize observations')
    parser.add_argument('--estimate_q',
                        '--estimate-q',
                        type=bool,
                        default=False,
                        help='whether policy should estimate q or v')
    parser.add_argument('--num_layers', '--num-layers', type=int, default=2)
    parser.add_argument('--num_hidden', '--num-hidden', type=int, default=64)
    parser.add_argument('--layer_norm',
                        '--layer-norm',
                        type=bool,
                        default=False)
    return parser
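
A caveat on the snippet above: `type=bool` is an argparse pitfall, since `bool(v)` is True for every non-empty string, so `--layer_norm False` still yields True. A converter along the lines of the `str2bool` helper that appears in Example #6 below avoids this:

import argparse

def str2bool(v):
    # Map textual truth values to real booleans; reject anything else.
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    if v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')

# e.g. parser.add_argument('--layer_norm', type=str2bool, default=False)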
Example #3
def mujoco_arg_parser():
    """
    Create an argparse.ArgumentParser for run_mujoco.py.
    """
    import ast
    parser = arg_parser()
    parser.add_argument('--env',
                        help='environment ID',
                        type=str,
                        default='InvertedPendulum-v2')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--num_timesteps', type=int, default=int(1e6))
    parser.add_argument('--play', default=False, action='store_true')
    parser.add_argument('--clipped_type', default='kl2clip', type=str)
    parser.add_argument('--use_tabular', default=False, type=ast.literal_eval)
    parser.add_argument('--cliprange', default=0.2, type=ast.literal_eval)
    parser.add_argument('--delta_kl', default=None, type=float)
    parser.add_argument('--lr', default=3e-4, type=float)

    # TODO: change the root directory path

    root_dir_default = '/tmp/baselines'
    if not os.path.exists(root_dir_default):
        tools.mkdir(root_dir_default)

    parser.add_argument('--root_dir', default=root_dir_default, type=str)
    parser.add_argument('--sub_dir', default=None, type=str)
    parser.add_argument('--policy_type', default='MlpPolicy', type=str)
    parser.add_argument('--force_write', default=1, type=int)
    return parser
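
Because `--use_tabular` and `--cliprange` use `type=ast.literal_eval`, they accept any Python literal from the command line, not just a fixed scalar type. A quick usage sketch against the parser above:

parser = mujoco_arg_parser()
args = parser.parse_args(['--use_tabular', 'True', '--cliprange', '(0.1, 0.3)'])
assert args.use_tabular is True      # a real bool, not the string 'True'
assert args.cliprange == (0.1, 0.3)  # a tuple literal parsed from the CLI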
Example #4
def argparser():
    """
    Create an argparse.ArgumentParser.
    """
    parser = arg_parser()
    parser.add_argument('--env', help='environment ID', type=str, default='CartPole-v0')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--num-timesteps', type=int, default=int(1e6))
    return parser
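
Note that argparse converts dashes in long option names to underscores when building attribute names, so the `--num-timesteps` flag above is read back as `args.num_timesteps`:

args = argparser().parse_args(['--num-timesteps', '500000'])
print(args.num_timesteps)  # 500000; the dash becomes an underscore in the dest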
Example #5
def neyboy_arg_parser():
    """
    Create an argparse.ArgumentParser for run_neyboy.py.
    """
    parser = arg_parser()
    parser.add_argument('--env', help='environment ID', default='neyboy-v0')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--num-timesteps', type=int, default=int(10e6))

    return parser
Example #6
def argparser():
    """
    Create an argparse.ArgumentParser.
    """
    def str2bool(v):
        if v.lower() in ('yes', 'true', 't', 'y', '1'):
            return True
        elif v.lower() in ('no', 'false', 'f', 'n', '0'):
            return False
        else:
            raise argparse.ArgumentTypeError('Boolean value expected.')

    parser = arg_parser()
    parser.add_argument('--env-id',
                        help='environment ID',
                        type=str,
                        default='RoboschoolReacher-v1')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--num-timesteps', type=int, default=int(0.5e5))
    parser.add_argument('--timesteps_per_episode',
                        type=int,
                        default=10000)
    parser.add_argument('--n_policy',
                        help='Number of policies to execute',
                        type=int,
                        default=1)
    parser.add_argument('--filepath', type=str, default='/tmp/')
    parser.add_argument('--visualize',
                        help='Load and visualize experiment?',
                        type=str2bool,
                        default=False)
    parser.add_argument('--retrace',
                        help='Use retrace?',
                        type=str2bool,
                        default=False)
    parser.add_argument('--trpo',
                        help='Use TRPO instead of COPOS?',
                        type=str2bool,
                        default=False)
    parser.add_argument('--entropy_bonus',
                        help='Entropy bonus factor',
                        type=float,
                        default=0.0)
    parser.add_argument('--epsilon',
                        help='Epsilon',
                        type=float,
                        default=0.01)
    parser.add_argument('--beta', help='Beta', type=float, default=0.01)
    parser.add_argument('--compatible',
                        help='Use compatible policy?',
                        type=str2bool,
                        default=True)

    return parser
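
With `str2bool` as the `type`, each boolean flag takes an explicit value on the command line instead of acting as an on/off switch:

args = argparser().parse_args(['--trpo', 'yes', '--visualize', '0'])
assert args.trpo is True
assert args.visualize is False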
Example #7
def common_arg_parser():
    """
    Create an argparse.ArgumentParser for run_mujoco.py.
    """
    parser = arg_parser()
    parser.add_argument('--env',
                        help='environment ID',
                        type=str,
                        default='Reacher-v2')
    parser.add_argument(
        '--env_type',
        help=
        'type of environment, used when the environment type cannot be automatically determined',
        type=str,
        default="custom")
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--alg', help='Algorithm', type=str, default='ppo2')
    parser.add_argument('--num_timesteps', type=float, default=1e6)
    parser.add_argument(
        '--network',
        help='network type (mlp, cnn, lstm, cnn_lstm, conv_only)',
        choices=['mlp', 'cnn', 'lstm', 'cnn_lstm', 'conv_only'],
        default='mlp')
    parser.add_argument(
        '--gamestate',
        help='game state to load (so far only used in retro games)',
        default=None)
    parser.add_argument(
        '--num_env',
        help=
        'Number of environment copies being run in parallel. When not specified, set to number of cpus for Atari, and to 1 for Mujoco',
        default=None,
        type=int)
    parser.add_argument('--reward_scale',
                        help='Reward scale factor. Default: 1.0',
                        default=1.0,
                        type=float)
    parser.add_argument('--save_path',
                        help='Path to save trained model to',
                        default=None,
                        type=str)
    parser.add_argument('--save_video_interval',
                        help='Save video every x steps (0 = disabled)',
                        default=1,
                        type=int)
    parser.add_argument('--save_video_length',
                        help='Length of recorded video. Default: 200',
                        default=200,
                        type=int)
    parser.add_argument('--play', default=False, action='store_true')
    return parser
Example #8
def control_arg_parser():
    """
    Create an argparse.ArgumentParser for run_box2d.py.
    """
    parser = arg_parser()
    parser.add_argument('--log_dir', type=str, default='../logs')
    parser.add_argument('--env',
                        help='environment ID',
                        type=str,
                        default='LunarLanderContinuousPOMDP-v0')
    # parser.add_argument('--net_size', help='Network size', default=[64,64], type=str2list)
    # parser.add_argument('--filter_size', help='Define filter size for modified CNN policy', default=[16, 2], type=str2list)
    parser.add_argument('--hist_len',
                        help='History Length',
                        type=int,
                        default=8)
    # parser.add_argument('--block_high', help='Define the height of the shelter area; should be greater than 1/2',
    #                     default=5/8, type=frac2float)
    parser.add_argument(
        '--block_high',
        help='Define the height of the shelter area; should be greater than 1/2',
        default=3 / 4,
        type=frac2float)
    parser.add_argument('--nsteps',
                        help='timesteps each iteration',
                        type=int,
                        default=2048)
    parser.add_argument('--hid_size',
                        help='number of neurons for each hidden layer',
                        type=int,
                        default=32)
    # parser.add_argument('--batch_size', help='batch size', type=int, default=32)
    parser.add_argument('--method',
                        help='method',
                        type=str,
                        default='trpo-new-evaluation')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--num-timesteps', type=int, default=int(1e6))
    parser.add_argument('--give_state',
                        help='0:False, 1:True',
                        type=int,
                        default=1)
    # parser.add_argument('--train', help='train', default=False, type=str2bool)
    # parser.add_argument('--render', help='render', default=False, type=str2bool)
    # parser.add_argument('--load_path', default=None)
    # parser.add_argument('--checkpoint', help='Use saved checkpoint?', default=False, type=str2bool)
    # parser.add_argument('--iters', help='Iterations so far(to produce videos)', default=0)
    # parser.add_argument('--use_entr', help='Use dynammic entropy regularization term?', default=False, type=str2bool)
    return parser
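
`control_arg_parser` relies on a `frac2float` converter that is not shown here. Judging from the commented-out default of 5/8, it parses fraction strings from the command line; a minimal sketch consistent with that usage (an assumption, not the original implementation):

def frac2float(s):
    # Parse '3/4' or a plain decimal such as '0.75' into a float.
    if '/' in s:
        num, den = s.split('/')
        return float(num) / float(den)
    return float(s)

# e.g. control_arg_parser().parse_args(['--block_high', '5/8']).block_high == 0.625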
Example #9
def argparser():
    """
    Create an argparse.ArgumentParser.
    """
    parser = arg_parser()
    parser.add_argument('--env', help='environment ID', type=str, default='MountainCarContinuous-v0')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--num-timesteps', type=int, default=int(0.5e5))
    parser.add_argument('--timesteps_per_episode', type=int, default=int(10000))
    parser.add_argument('--n_policy', help='Number of policies to execute', type=int, default=1)
    parser.add_argument('--filepath', type=str, default='/tmp/')
    # NOTE: type=bool is the argparse pitfall flagged in Example #2;
    # the str2bool converter from Example #6 is the safer choice here.
    parser.add_argument('--visualize', help='Load and visualize experiment?', type=bool, default=False)
    parser.add_argument('--retrace', help='Use retrace?', type=bool, default=False)
    parser.add_argument('--trpo', help='Use TRPO instead of COPOS?', type=bool, default=False)

    return parser
Example #10
def arg_parser_of_interest():
    parser = arg_parser()
    parser.add_argument(
        '--process_id',
        help='Process ID (among all hyperparameter combinations)',
        type=int,
        default=0)

    parser.add_argument('--alg', help='Algorithm', type=str, default='bgail')
    parser.add_argument('--env',
                        help='environment ID',
                        type=str,
                        default='Hopper-v1')
    parser.add_argument('--num_expert_trajs',
                        help='Number of expert trajectories for training',
                        default=25,
                        type=int)
    parser.add_argument(
        '--d_step',
        help='Number of classifier update steps for each iteration',
        default=5,
        type=int)
    parser.add_argument('--num_particles',
                        help='Number of SVGD or Ensemble classifiers',
                        default=5,
                        type=int)
    parser.add_argument('--timesteps_per_batch',
                        help='Minimum batch size for each iteration',
                        default=1000,
                        type=int)
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)

    parser.add_argument('--save_path',
                        help='Path to save trained model to',
                        default='./outputs',
                        type=str)

    # NOTE: no type or action is given, so any value passed on the command
    # line arrives as a (truthy) string; only the default is a real bool.
    parser.add_argument('--use_classifier_logsumexp',
                        help='Use classifier logsumexp or not',
                        default=True)

    return parser
Example #11
def main():
    parser = arg_parser()
    parser.add_argument(
        '--flags',
        '-f',
        help="flags cfg file (will load checkpoint in save dir if found)",
        default=None)
    args = parser.parse_args()

    flags = RogueAcerFlags.from_cfg(
        args.flags) if args.flags else RogueAcerFlags()
    RogueEnv.register(flags)
    logger.configure(flags.log_dir)

    env = make_rogue_env(num_env=flags.num_env, seed=flags.seed)

    set_global_seeds(flags.seed)
    policy_fn = models.get(flags.policy)
    learn(policy_fn, env, flags)

    env.close()
Example #12
def main():
    parser = arg_parser()
    parser.add_argument('--policy',
                        help='Policy architecture',
                        choices=['cnn', 'lstm', 'lnlstm'],
                        default='cnn')
    parser.add_argument('--env',
                        help='environment ID',
                        default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--N_itr', type=int, default=int(2e4))
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--lrschedule',
                        help='Learning rate schedule',
                        choices=['constant', 'linear'],
                        default='constant')
    parser.add_argument('--save_interval',
                        help='model save frequency',
                        type=int,
                        default=1000)
    parser.add_argument('--alg',
                        help='training algorithm',
                        choices=['a2c', 'ppo2'],
                        default='a2c')
    args = parser.parse_args()
    log_path = "./Data/" + args.alg + '_' + args.policy + "_" + args.env + "lr" + str(
        args.lr) + "seed" + str(args.seed)
    # log_path = "./Data/a2cTest/"
    logger.configure(dir=log_path)
    train(args.env,
          N_itr=args.N_itr,
          seed=args.seed,
          policy=args.policy,
          lr=args.lr,
          lrschedule=args.lrschedule,
          num_env=16,
          log_path=log_path,
          save_interval=args.save_interval,
          alg=args.alg)
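
The `log_path` concatenation above is easy to misread; an f-string produces the same path more legibly (a style note, not a behavioral change):

log_path = f"./Data/{args.alg}_{args.policy}_{args.env}lr{args.lr}seed{args.seed}"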
Example #13
def atari_arg_parser():
    """
    Create an argparse.ArgumentParser for run_atari.py.
    """
    import ast
    parser = arg_parser()
    parser.add_argument('--env',
                        help='environment ID',
                        default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    parser.add_argument('--clipped_type', default='kl2clip', type=str)
    parser.add_argument('--use_tabular', default=False, type=ast.literal_eval)
    parser.add_argument('--cliprange', default=0.1, type=ast.literal_eval)
    parser.add_argument('--delta_kl', default=0.001, type=float)
    root_dir_default = '/tmp/baselines'
    if not os.path.exists(root_dir_default):
        tools.mkdir(root_dir_default)

    parser.add_argument('--root_dir', default=root_dir_default, type=str)
    parser.add_argument('--sub_dir', default=None, type=str)
    parser.add_argument('--force_write', default=1, type=int)
    return parser
Example #14
def main():
    parser = arg_parser()
    parser.add_argument(
        '--env', help='environment ID',
        default='MiniGrid-MultiRoom-N2-S4-v0')  # MiniGrid-MultiRoom-N4-v0
    parser.add_argument('--seed', help='RNG seed', type=int, default=1)
    parser.add_argument('--num-timesteps', type=int, default=int(30e6))
    parser.add_argument('--policy',
                        help='Policy architecture',
                        choices=['cnn', 'lstm', 'lnlstm'],
                        default='cnn')
    parser.add_argument('--lrschedule',
                        help='Learning rate schedule',
                        choices=['constant', 'linear'],
                        default='constant')
    parser.add_argument('--sil-update',
                        type=int,
                        default=4,
                        help="Number of updates per iteration")
    parser.add_argument('--sil-beta',
                        type=float,
                        default=0.1,
                        help="Beta for weighted IS")
    parser.add_argument('--log', default='./log')
    parser.add_argument('--save_name',
                        default='MultiRoomN2S4_a2c',
                        help="Path for saved model")

    args = parser.parse_args()
    logger.configure(dir=args.log)
    train(args.env,
          args.save_name,
          num_timesteps=args.num_timesteps,
          seed=args.seed,
          policy=args.policy,
          lrschedule=args.lrschedule,
          sil_update=args.sil_update,
          sil_beta=args.sil_beta,
          num_env=16)
Example #15
def main():
    parser = arg_parser()
    parser.add_argument('--env',
                        help='environment ID',
                        default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--num-timesteps', type=int, default=int(1e4))
    parser.add_argument('--policy',
                        help='Policy architecture',
                        choices=['lstm', 'qmdp'],
                        default='qmdp')
    parser.add_argument('--lrschedule',
                        help='Learning rate schedule',
                        choices=['constant', 'linear'],
                        default='constant')
    args = parser.parse_args()
    logger.configure(dir="./Data/a2cTest/")
    train(args.env,
          num_timesteps=args.num_timesteps,
          seed=args.seed,
          policy=args.policy,
          lrschedule=args.lrschedule,
          num_env=16)
Example #16
def main():
    parser = arg_parser()
    parser.add_argument('--env',
                        help='environment ID',
                        default='carnivalRam20-v0')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--N_itr', type=int, default=int(2e4))
    parser.add_argument('--policy',
                        help='Policy architecture',
                        choices=['lstm16', 'lstm2', 'qmdp', 'qmdp_relu',
                                 'qmdp_split', 'qmdp_k1', 'qmdp_shallow',
                                 'qmdp_dc', 'qmdp_svn'],
                        default='qmdp')
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--lrschedule',
                        help='Learning rate schedule',
                        choices=['constant', 'linear'],
                        default='constant')
    parser.add_argument('--save_interval',
                        help='model save frequency',
                        type=int,
                        default=1000)
    parser.add_argument('--alg',
                        help='training algorithm',
                        choices=['a2c', 'ppo2'],
                        default='a2c')
    args = parser.parse_args()
    log_path = "./Data/" + args.alg + '_' + args.policy + "_" + args.env + "lr" + str(
        args.lr) + "seed" + str(args.seed) + "_ID_12345/"
    # log_path = "./Data/a2cTest/"
    logger.configure(dir=log_path)
    train(args.env,
          N_itr=args.N_itr,
          seed=args.seed,
          policy=args.policy,
          lr=args.lr,
          lrschedule=args.lrschedule,
          num_env=16,
          log_path=log_path,
          save_interval=args.save_interval,
          alg=args.alg)
Example #17
def ppo_arg_parser():
    parser = arg_parser()
    parser.add_argument('--nsteps',
                        type=int,
                        default=2048,
                        help='number of steps of the vectorized '
                        'environment per update (i.e. batch'
                        ' size is nsteps * nenv where nenv'
                        ' is number of environment copies'
                        ' simulated in parallel)')
    parser.add_argument('--ent-coef',
                        '--ent_coef',
                        type=float,
                        default=0.0,
                        help='entropy coefficient')
    parser.add_argument('--lr', type=float, default=3e-4, help='learning rate')
    parser.add_argument('--vf-coef',
                        '--vf_coef',
                        type=float,
                        default=0.5,
                        help='value fn loss coefficient')
    parser.add_argument('--max-grad-norm',
                        '--max_grad_norm',
                        type=float,
                        default=0.5,
                        help='grad norm clipping scalar')
    parser.add_argument('--gamma',
                        type=float,
                        default=0.99,
                        help='discount factor')
    parser.add_argument('--lam',
                        type=float,
                        default=0.95,
                        help='advantage estimation discounting factor')
    parser.add_argument('--log-interval',
                        '--log_interval',
                        type=int,
                        default=10,
                        help='logging interval')
    parser.add_argument('--nminibatches',
                        type=int,
                        default=4,
                        help='number of training minibatches per update')
    parser.add_argument('--noptepochs',
                        type=int,
                        default=4,
                        help='number of training epochs per update')
    parser.add_argument('--cliprange',
                        type=float,
                        default=0.2,
                        help='clipping range, or a schedule function [0,1] -> R+'
                        ' where 1 is the beginning of training')
    parser.add_argument('--save-interval',
                        '--save_interval',
                        type=int,
                        default=0,
                        help='number of timesteps between saving events')
    parser.add_argument('--load-path',
                        '--load_path',
                        type=str,
                        default=None,
                        help='path to load model from')
    return parser
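
Each option above registers both a dashed and an underscored spelling. argparse derives the attribute name from the first long option it sees, so both aliases fill the same dest:

parser = ppo_arg_parser()
a = parser.parse_args(['--ent-coef', '0.01'])
b = parser.parse_args(['--ent_coef', '0.01'])
assert a.ent_coef == b.ent_coef == 0.01  # same dest either way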
Example #18
def video(flags=RogueAcerFlags(), checkpoint_path=None, record_dir=None):
    try:
        RogueEnv.register(flags)
    except gym.error.Error:
        # an error is raised if Rogue was already registered
        pass

    options = AgentOptions(gui=True, gui_timer_ms=50, userinterface='curses')

    agent = ACER_Agent(options, flags=flags, checkpoint_path=checkpoint_path)

    if record_dir:
        agent = RecordingWrapper(agent, record_dir=record_dir)

    agent.run()


if __name__ == '__main__':
    parser = arg_parser()
    parser.add_argument('--flags', '-f', help="flags cfg file", default=None)
    parser.add_argument('--record_dir', '-r',
                        help="directory where to record frames on file (leave blank to avoid recording)",
                        default='')
    parser.add_argument('--checkpoint_path', '-c',
                        help="checkpoint file to load (without extension). N.B.: if you don't provide one, an untrained model will be used")
    args = parser.parse_args()

    flags = RogueAcerFlags.from_cfg(args.flags) if args.flags else RogueAcerFlags()

    video(flags=flags, checkpoint_path=args.checkpoint_path, record_dir=args.record_dir)
Example #19
def arg_parser_common():
    """
    Create an argparse.ArgumentParser for run_mujoco.py.
    """
    import ast, json
    parser = arg_parser()
    parser.add_argument('--env', help='environment ID', type=str, default='Walker2d-v2')
    parser.add_argument('--isatari', default=False, action='store_true')

    # parser.add_argument('--env', help='environment ID', type=str, default='AtlantisNoFrameskip')#TODO: tmp
    # parser.add_argument('--isatari', default=True, action='store_true')

    parser.add_argument('--seed', help='RNG seed', type=int, default=0)

    parser.add_argument('--alg', help='You can run the following algorithms: pporb, trppo, trpporb, trulyppo', default='trulyppo', type=str)

    parser.add_argument('--cliptype', default='', type=str)  # e.g. wasserstein_wassersteinrollback_constant, kl_klrollback_constant_withratio
    parser.add_argument('--clipargs', default=dict(), type=json.loads)


    # Defaults are resolved in the order they appear below. Explicit input
    # args take the highest priority; among the defaults, cliptype-specific
    # values take precedence.
    args_default_all = \
        dict(__envtype=dict(
            mujoco=dict(
                policy_type = 'MlpPolicyExt',
                n_steps = 1024,
                n_envs = 2,
                n_minibatches = 32,
                n_opt_epochs = 10,
                lr = 3e-4,
                coef_entropy = 0,
                eval_interval = 1,
                num_timesteps = int(1e6),
                save_interval = 10,
                logstd = 0,
                __env = dict(
                    Humanoid = dict(
                        n_envs = 64,
                        n_minibatches = 64,
                        num_timesteps = int(20e6),
                    ),
                    HalfCheetah = dict(
                        logstd = -1.34
                    )
                ),
                __cliptype = dict(
                    ratio=dict(clipargs=dict(cliprange=0.2)),
                    ratio_rollback=dict(
                        clipargs=dict(cliprange=0.2, slope_rollback=-0.3),
                        __env = dict(
                            Humanoid=dict(
                                logstd=-1.34657,
                                clipargs=dict(cliprange=0.2, slope_rollback=-0.02)
                            )
                        )
                    ),
                    ratio_strict=dict(clipargs=dict(cliprange=0.2)),
                    ratio_rollback_constant=dict(clipargs=dict(cliprange=0.2, slope_rollback=-0.3)),

                    a2c=dict(clipargs=dict(cliprange=0.1)),

                    kl=dict(
                        clipargs=dict(klrange=0.035, cliprange=0.2),
                        __env=dict(
                            Humanoid=dict(
                                logstd=-0.5,
                                clipargs=dict(klrange=0.05, cliprange=0.2)
                            )
                        )
                    ),
                    kl_strict=dict(clipargs=dict(klrange=0.025, cliprange=0.2)),
                    kl_ratiorollback=dict(clipargs=dict(klrange=0.03, slope_rollback=-0.05, cliprange=0.2)),
                    kl_klrollback_constant=dict(clipargs=dict(klrange=0.03, slope_rollback=-0.4, cliprange=0.2)),
                    kl_klrollback_constant_withratio=dict(
                        # The common args
                        clipargs=dict(klrange=0.03, slope_rollback=-5, slope_likelihood=1, cliprange=0.2),
                        # The args for specific env
                        __env=dict(
                            Humanoid=dict(
                                logstd=-0.5,
                                clipargs=dict(klrange=0.05, slope_rollback=-0.4, slope_likelihood=0, cliprange=0.2)
                            )
                        )
                    ),
                    kl_klrollback=dict(clipargs=dict(klrange=0.03, slope_rollback=-0.1, cliprange=0.2)),

                    # klrange is used for kl2clip, which could be None. If it's None, it is adjusted by cliprange.
                    # cliprange is used for value clip, which could be None. If it's None, it is adjusted by klrange.
                    kl2clip=dict(
                        clipargs=dict(klrange=None, adjusttype='base_clip_upper', cliprange=0.2, kl2clip_opttype='tabular',
                                      adaptive_range=''),
                        __env=dict(
                            Humanoid=dict(
                                logstd=-1.34657359,
                                clipargs=dict(klrange=0.03, slope_rollback=-5, slope_likelihood=1.)
                            )
                        )
                    ),
                    kl2clip_rollback=dict(
                        clipargs=dict(klrange=None, adjusttype='base_clip_upper', cliprange=0.2, kl2clip_opttype='tabular',
                                      adaptive_range='', slope_rollback=-0.3)
                    ),

                    adaptivekl=dict(clipargs=dict(klrange=0.01, cliprange=0.2)),
                    adaptiverange_advantage=dict(clipargs=dict(cliprange_min=0, cliprange_max=0.4, cliprange=0.2)),

                    wasserstein=dict(clipargs=dict(range=0.05, cliprange=0.2)),
                    wasserstein_rollback_constant=dict(clipargs=dict(range=0.05, slope_rollback=-0.4, cliprange=0.2)),
                )
            ),
            atari=dict(
                policy_type='CnnPolicy',
                n_steps = 128 ,
                n_envs = 8,
                n_minibatches = 4,
                n_opt_epochs = 4,
                lr = 2.5e-4,
                coef_entropy= 0.01,
                eval_interval=0,
                num_timesteps=int(1e7),
                save_interval = 400,
                logstd = 0,
                __cliptype= dict(
                    ratio=dict(clipargs=dict(cliprange=0.1)),
                    ratio_rollback=dict(clipargs=dict(cliprange=0.1, slope_rollback=-0.01)),

                    a2c=dict(clipargs=dict(cliprange=0.1)),

                    kl=dict(
                        clipargs=dict(klrange=0.001, cliprange=0.1, decay_threshold=0.),
                        coef_entropy = 0
                    ),
                    kl_ratiorollback=dict(clipargs=dict(klrange=0.001,slope_rollback=-0.05, cliprange=0.1, decay_threshold=0.)),
                    kl_klrollback_constant=dict(clipargs=dict(klrange=0.001, slope_rollback=-0.05, cliprange=0.1, decay_threshold=0.)),
                    kl_klrollback_constant_withratio= dict(
                        clipargs = dict(klrange=0.0008, slope_rollback=-20, slope_likelihood=1, cliprange=0.1, decay_threshold=0.),
                        coef_entropy=0,
                    ),

                    totalvariation=dict(clipargs=dict(range=0.02, cliprange=0.1, decay_threshold=0.)),
                    totalvariation_rollback_constant=dict(
                        clipargs=dict(range=0.02, slope_rollback=-0.05, cliprange=0.1, decay_threshold=0.)
                    ),
                    kl2clip=dict(
                        clipargs=dict(klrange=0.001, cliprange=0.1, kl2clip_opttype='tabular', adaptive_range=''),
                        coef_entropy=0
                    ),
                    adaptivekl=dict(
                        clipargs=dict(klrange=0.01, cliprange=0.1)
                    ),
                )
            )
        ))

    parser.add_argument('--lam', default=0.95, type=float )
    parser.add_argument('--lr', default=None, type=float)

    parser.add_argument('--policy_type', default=None, type=str)

    parser.add_argument('--log_dir_mode', default='finish_then_exit_else_overwrite', type=str)  # choices: overwrite, finish_then_exit_else_overwrite
    parser.add_argument('--name_group', default=None, type=str)
    parser.add_argument('--keys_group', default=[], type=ast.literal_eval)

    # architecture of network
    parser.add_argument('--policy_variance_state_dependent', default=False, type=ast.literal_eval)
    parser.add_argument('--hidden_sizes', default=64, type=ast.literal_eval)
    parser.add_argument('--num_layers', default=2, type=ast.literal_eval)
    parser.add_argument('--num_sharing_layers', default=0, type=int)
    parser.add_argument('--ac_fn', default='tanh', type=str)

    parser.add_argument('--coef_predict_task', default=0, type=float)
    parser.add_argument('--reward_scale', default=1., type=float)
    parser.add_argument('--lam_decay', default=False, type=ast.literal_eval)
    # ----- Keep the defaults of the following args as None; their effective values differ per task.
    parser.add_argument('--coef_entropy', default=None, type=float)
    parser.add_argument('--n_envs', default=None, type=int)
    parser.add_argument('--n_steps', default=None, type=int)
    parser.add_argument('--n_minibatches', default=None, type=int)
    parser.add_argument('--n_opt_epochs', default=None, type=int)
    parser.add_argument('--logstd', default=None, type=float)

    parser.add_argument('--log_interval', default=1, type=int)
    parser.add_argument('--n_eval_epsiodes', default=1, type=int)
    parser.add_argument('--num_timesteps', type=int, default=None)
    parser.add_argument('--eval_interval', type=int, default=None)
    parser.add_argument('--save_interval', default=None, type=int)
    parser.add_argument('--save_debug', default=False, action='store_true')
    # parser.add_argument('--debug_halfcheetah', default=0, type=int)
    parser.add_argument('--is_multiprocess', default=0, type=ast.literal_eval)
    return parser, args_default_all
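
`arg_parser_common` returns `args_default_all` alongside the parser, so a caller is expected to merge the nested defaults (the `__envtype`, `__env`, and `__cliptype` keys select per-environment-type, per-environment, and per-cliptype overrides) into the parsed args. A hypothetical sketch of such a merge, honoring the stated priority that explicit input args win:

def resolve_defaults(args, defaults, envtype, env, cliptype):
    # Hypothetical helper: flatten the nested default tables, most specific last.
    table = defaults['__envtype'][envtype]
    merged = {k: v for k, v in table.items() if not k.startswith('__')}
    for selector, key in (('__env', env), ('__cliptype', cliptype)):
        override = table.get(selector, {}).get(key, {})
        merged.update({k: v for k, v in override.items() if not k.startswith('__')})
    for k, v in merged.items():
        if getattr(args, k, None) is None:  # only fill values the user left unset
            setattr(args, k, v)
    return args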
Example #20
def common_arg_parser():
    """
    Create an argparse.ArgumentParser.
    """
    parser = arg_parser()
    parser.add_argument('--env',
                        help='environment ID',
                        type=str,
                        default='Reacher-v2')
    parser.add_argument(
        '--env_type',
        help=
        'type of environment, used when the environment type cannot be automatically determined',
        type=str)
    parser.add_argument('--seed', help='RNG seed', type=int, default=None)
    parser.add_argument('--alg', help='Algorithm', type=str, default='ppo2')
    parser.add_argument('--num_timesteps', type=float, default=1e6)
    parser.add_argument(
        '--network',
        help='network type (mlp, cnn, lstm, cnn_lstm, conv_only)',
        default=None)
    parser.add_argument(
        '--gamestate',
        help='game state to load (so far only used in retro games)',
        default=None)
    parser.add_argument(
        '--num_env',
        help=
        'Number of environment copies being run in parallel. When not specified, set to number of cpus for Atari, and to 1 for Mujoco',
        default=None,
        type=int)
    parser.add_argument('--reward_scale',
                        help='Reward scale factor. Default: 1.0',
                        default=1.0,
                        type=float)
    parser.add_argument('--save_path',
                        help='Path to save trained model to',
                        default=None,
                        type=str)
    parser.add_argument('--save_video_interval',
                        help='Save video every x steps (0 = disabled)',
                        default=0,
                        type=int)
    parser.add_argument('--save_video_length',
                        help='Length of recorded video. Default: 200',
                        default=200,
                        type=int)
    parser.add_argument('--log_path',
                        help='Directory to save learning curve data.',
                        default=None,
                        type=str)
    parser.add_argument('--play', default=False, action='store_true')
    # RM-related arguments
    parser.add_argument("--use_rs",
                        help="Use reward shaping",
                        action="store_true",
                        default=False)
    parser.add_argument("--use_crm",
                        help="Use counterfactual experience",
                        action="store_true",
                        default=False)
    parser.add_argument('--gamma',
                        help="Discount factor",
                        type=float,
                        default=0.9)
    parser.add_argument('--rs_gamma',
                        help="Discount factor used for reward shaping",
                        type=float,
                        default=0.9)
    parser.add_argument(
        '--r_min',
        help="R-min reward used for training option policies in hrm",
        type=float,
        default=0.0)
    parser.add_argument(
        '--r_max',
        help="R-max reward used for training option policies in hrm",
        type=float,
        default=1.0)
    parser.add_argument("--use_self_loops",
                        help="Add option policies for self-loops in the RMs",
                        action="store_true",
                        default=False)
    return parser
Example #21
def arg_parser_common():
    """
    Create an argparse.ArgumentParser for run_mujoco.py.
    """
    import ast, json
    parser = arg_parser()
    parser.add_argument('--env',
                        help='environment ID',
                        type=str,
                        default='InvertedPendulum-v2')
    parser.add_argument('--is_atari', default=False, action='store_true')

    # parser.add_argument('--env', help='environment ID', type=str, default='PongNoFrameskip')
    # parser.add_argument('--is_atari', default=True, action='store_true')

    parser.add_argument('--seed', help='RNG seed', type=int, default=0)

    parser.add_argument('--cliptype', default='kl2clip',
                        type=str)  # e.g. wasserstein_wassersteinrollback_constant
    # parser.add_argument('--cliprange', default=0.2, type=float)
    # import demjson
    parser.add_argument('--clipargs', default=dict(), type=json.loads)

    clipargs_default_all = {
        MUJOCO:
        dict(
            ratio=dict(cliprange=0.2),
            ratio_rollback=dict(cliprange=0.2, slope_rollback=-0.3),
            ratio_strict=dict(cliprange=0.2),
            ratio_rollback_constant=dict(cliprange=0.2, slope_rollback=-0.3),
            a2c=dict(cliprange=0.1),
            wasserstein=dict(range=0.05, cliprange=0.2),
            wasserstein_wassersteinrollback_constant=dict(range=0.05,
                                                          slope_rollback=-0.4,
                                                          cliprange=0.2),
            kl=dict(klrange=0.03, cliprange=0.2),
            kl_strict=dict(klrange=0.025, cliprange=0.2),
            kl_ratiorollback=dict(klrange=0.03,
                                  slope_rollback=-0.05,
                                  cliprange=0.2),
            kl_klrollback_constant=dict(klrange=0.03,
                                        slope_rollback=-0.1,
                                        cliprange=0.2),
            kl_klrollback=dict(klrange=0.03,
                               slope_rollback=-0.1,
                               cliprange=0.2),

            # base_clip_lower, base_clip_upper
            # kl2clip = dict(klrange=0.03, adjusttype='origin', cliprange=0.2, kl2clip_opttype='tabular', adaptive_range=''),
            kl2clip=dict(klrange=None,
                         adjusttype='base_clip_upper',
                         cliprange=0.2,
                         kl2clip_opttype='tabular',
                         adaptive_range=''),
            kl2clip_rollback=dict(klrange=None,
                                  adjusttype='base_clip_upper',
                                  cliprange=0.2,
                                  kl2clip_opttype='tabular',
                                  adaptive_range='',
                                  slope_rollback=-0.3),

            # kl2clip = dict( klrange=None, adjusttype='base_clip_lower', cliprange=0.2)
            # kl2clip=dict(klrange=None, adjusttype='base_clip_upper', cliprange=0.2, kl2clip_opttype='tabular'),#nn
            # klrange is used for kl2clip, which could be None. If it's None, it is adjusted by cliprange.
            # cliprange is used for value clip, which could be None. If it's None, it is adjusted by klrange.
            adaptivekl=dict(klrange=0.01, cliprange=0.2),
            adaptiverange_advantage=dict(cliprange_min=0,
                                         cliprange_max=0.4,
                                         cliprange=0.2)),
        ATARI:
        dict(
            # TODO:!!!  Please modify the parameters here
            ratio=dict(cliprange=0.1),
            ratio_rollback=dict(cliprange=0.1, slope_rollback=-0.3),
            ratio_strict=dict(cliprange=0.1),
            ratio_rollback_constant=dict(cliprange=0.1, slope_rollback=-0.3),
            a2c=dict(cliprange=0.1),
            kl=dict(klrange=0.03, cliprange=0.1),
            kl_strict=dict(klrange=0.025, cliprange=0.1),
            kl_ratiorollback=dict(klrange=0.03,
                                  slope_rollback=-0.05,
                                  cliprange=0.1),
            kl_klrollback_constant=dict(klrange=0.03,
                                        slope_rollback=-0.1,
                                        cliprange=0.1),
            kl_klrollback=dict(klrange=0.03,
                               slope_rollback=-0.1,
                               cliprange=0.1),

            # kl2clip = dict(klrange=0.03, adjusttype='origin', cliprange=0.2)
            # kl2clip = dict( klrange=None, adjusttype='base_clip_lower', cliprange=0.2)
            kl2clip=dict(klrange=0.001,
                         cliprange=0.1,
                         kl2clip_opttype='tabular',
                         adaptive_range=''),
            # klrange is used for kl2clip, which could be None. If it's None, it is adjusted by cliprange.
            # cliprange is used for value clip, which could be None. If it's None, it is adjusted by klrange.
            adaptivekl=dict(klrange=0.01, cliprange=0.1))
    }
    # parser.add_argument('--cliptype', default='origin', type=str)
    # parser.add_argument('--slope', default=0, type=float)
    # parser.add_argument('--cliprange', default=0.2, type=ast.literal_eval)
    # parser.add_argument('--delta_kl', default=None, type=ast.literal_eval)

    parser.add_argument('--lam', default=0.95, type=float)
    parser.add_argument('--lr', default=None, type=float)

    parser.add_argument('--policy_type', default=None, type=str)

    parser.add_argument('--log_dir_mode',
                        default='finish_then_exit_else_overwrite',
                        type=str)  # choices: overwrite, finish_then_exit_else_overwrite
    parser.add_argument('--name_group', default='tmp', type=str)
    parser.add_argument('--keys_group',
                        default=['cliptype', 'clipargs'],
                        type=ast.literal_eval)

    # architecture of network
    parser.add_argument('--policy_variance_state_dependent',
                        default=False,
                        type=ast.literal_eval)
    parser.add_argument('--hidden_sizes', default=64, type=ast.literal_eval)
    parser.add_argument('--num_layers', default=2, type=ast.literal_eval)
    parser.add_argument('--num_sharing_layers', default=0, type=int)
    parser.add_argument('--ac_fn', default='tanh', type=str)

    # parser.add_argument('--explore', default=0, type=int)
    # parser.add_argument('--explore_timesteps', default=0, type=int)
    # parser.add_argument('--explore_additive_threshold', default=None, type=float)
    # parser.add_argument('--explore_additive_rate', default=0, type=float)

    parser.add_argument('--coef_predict_task', default=0, type=float)
    parser.add_argument('--reward_scale', default=1., type=float)
    parser.add_argument('--lam_decay', default=False, type=ast.literal_eval)
    # ----- Keep the defaults of the following args as None; their effective values differ per task.
    parser.add_argument('--coef_entropy', default=None, type=float)
    parser.add_argument('--n_envs', default=None, type=int)
    parser.add_argument('--n_steps', default=None, type=int)
    parser.add_argument('--n_minibatches', default=None, type=int)
    parser.add_argument('--n_opt_epochs', default=None, type=int)
    parser.add_argument('--logstd', default=None, type=float)

    parser.add_argument('--log_interval', default=1, type=int)
    parser.add_argument('--n_eval_epsiodes', default=1, type=int)
    parser.add_argument('--num_timesteps', type=int, default=None)
    parser.add_argument('--eval_interval', type=int, default=None)
    parser.add_argument('--save_interval', default=None, type=int)
    parser.add_argument('--save_debug', default=False, action='store_true')
    args_default_all = \
        {
            # MUJOCO
            MUJOCO: dict(
                policy_type = dict(_default='MlpPolicyExt'),
                n_steps = dict( _default=1024  ),
                n_envs = dict( Humanoid=64, _default=2 ),
                n_minibatches = dict( Humanoid=64, _default=32 ),
                n_opt_epochs = dict(_default=10),
                lr = dict(_default=3e-4),
                coef_entropy = dict( _default=0 ),
                eval_interval = dict( _default=1 ),
                num_timesteps = dict( Humanoid=int(20e6),  _default=int(1e6) ),
                save_interval = dict( _default=10 ),
                logstd = dict( HalfCheetah=-1.34, Humanoid=-1.34657, _default=0, ),
            ),
            # ATARI
            ATARI: dict(
                policy_type=dict(_default='CnnPolicy'),
                n_steps = dict( _default=128  ),
                n_envs = dict( _default=8 ),
                n_minibatches = dict( _default=4 ),
                n_opt_epochs = dict( _default=4 ),
                lr = dict(_default=2.5e-4 ),
                coef_entropy=dict(_default=0),  # TODO: tmp for kl2clip
                eval_interval=dict(_default=-1),
                num_timesteps=dict(_default=int(1e7)),
                save_interval = dict(  _default=400 ),
            )
        }
    # parser.add_argument('--debug_halfcheetah', default=0, type=int)
    parser.add_argument('--is_multiprocess', default=0, type=ast.literal_eval)
    return parser, clipargs_default_all, args_default_all
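
In this variant, each entry of `args_default_all` is a small table keyed by environment name with a `_default` fallback (e.g. `n_envs = dict(Humanoid=64, _default=2)`). A hypothetical lookup consistent with that shape:

def lookup_default(table, env_id):
    # Hypothetical helper: match the env name prefix, else fall back to _default.
    for key, value in table.items():
        if key != '_default' and env_id.startswith(key):
            return value
    return table['_default']

# e.g. lookup_default(dict(Humanoid=64, _default=2), 'Humanoid-v2') -> 64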