Example #1
import numpy as np
from run_ple_utils import make_ple_envs, make_ple_env

RENDER = False  # set True to render the environment while stepping


def main():
    seed = 15

    # ---- Specify the version of CFB ----
    game = 'ContFlappyBird'
    ns = 'gfNS'                         # 'gfNS', 'gsNS', 'rand_feat'
    nrandfeat = ('-nrf' + str(2))       # '', 0,2,3,4
    noiselevel = ('-nl' + str(0.001))   # '', 0.0001 - 0.05 (see env/__init__.py)
    experiment_phase = '-test'          # '-test', '-train'

    # Naming convention is <game>-<non-stationarity>-nl<noise_level>-nrf<nrandfeat>-<phase>-v0
    env_name = (game + '-' + ns + noiselevel + nrandfeat + experiment_phase + '-v0')
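    # With the settings above, env_name == 'ContFlappyBird-gfNS-nl0.001-nrf2-test-v0'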

    # ---- Generate CFB with single instance ----
    env = make_ple_env(env_name, seed=seed)
    # Run env:
    env.seed(seed=seed)
    env.reset()
    for i in range(100):
        state, reward, done, info = env.step(action=np.random.randint(len(env.action_space)+1))
        if RENDER:
            env.render()

    # ---- Generate CFB with N parallel instances. ----
    N = 3
    env = make_ple_envs(env_name, num_env=N, seed=seed)
    # Run env:
    env.seed(seed=seed)
    env.reset()
    for i in range(100):
        state, reward, done, info = env.step(action=np.random.randint(len(env.action_space)+1))
        if RENDER:
            env[0].render()
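
If the wrapped environment exposes a standard gym-style Discrete action space (an assumption, not verified here), the stepping loop can also draw actions with action_space.sample() instead of the hard-coded randint. A minimal single-instance sketch under that assumption:

from run_ple_utils import make_ple_env

env = make_ple_env('ContFlappyBird-gfNS-nl0.001-nrf2-test-v0', seed=15)
env.seed(seed=15)
env.reset()
for _ in range(100):
    state, reward, done, info = env.step(env.action_space.sample())  # assumes a gym Discrete action space
env.close()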
Example #2
    logger.info(
        'Total number of finished episodes during training: sum(%s) = %s' %
        (runner.ep_idx, sum(runner.ep_idx)))
    logger.info('Total number of parameter updates during training: %s' %
                i_train)
    logger.info('*******************************************************\n')

    return breaked


import logging

from run_ple_utils import make_ple_envs, make_ple_env
from models import MLPPolicy, LSTMPolicy, GRUPolicy
if __name__ == '__main__':
    seed = 1
    env = make_ple_envs('ContFlappyBird-hNS-nrf0-train-v0',
                        num_env=1,
                        seed=seed)
    test_env = make_ple_env('ContFlappyBird-v3', seed=seed)
    logger = logging.getLogger()
    ch = logging.StreamHandler()  # Handler which writes to stderr (in red)
    ch.setLevel(logging.INFO)
    ch.setFormatter(logging.Formatter('%(levelname)s:%(name)s: %(message)s'))
    logger.addHandler(ch)
    logger.setLevel(logging.INFO)

    BATCH_SIZE = 64

    # SMAC config 1
    ACTIV_FCN = 'mixed'
    DISCOUNT = 0.94
    ENT_COEFF = 0.000036
Example #3
def run_ppo_smac(**kwargs):
    params = ppo_params_parser(**kwargs)

    # logger = logging.getLogger(__name__)
    # logger.propagate = False  # no duplicate logging outputs
    # fh = logging.FileHandler(os.path.join(params["logdir"], 'run.log'))
    # fh.setLevel(logging.INFO)
    # fh.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s:%(name)s: %(message)s'))
    # logger.addHandler(fh)

    seed = params["seed"]
    ple_env = make_ple_envs(params["env"], num_env=params["nenvs"], seed=seed)
    test_env = make_ple_env(params["test_env"], seed=3000)

    if params["architecture"] == 'ff':
        policy_fn = LargerMLPPolicy
    elif params["architecture"] == 'lstm':
        policy_fn = LargerLSTMPolicy
    elif params["architecture"] == 'gru':
        policy_fn = GRUPolicy
    else:
        print('Policy option %s is not implemented yet.' % params["architecture"])

    with open(os.path.join(params["logdir"], 'hyperparams.txt'), 'a') as f:
        for k, v in params.items():
            f.write(k + ': ' + str(v) + '\n')
    print(params)

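    # Train the PPO agent; learn() reports whether the run was stopped early.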
    early_stopped = learn(policy_fn,
                          env=ple_env,
                          test_env=test_env,
                          seed=seed,
                          total_timesteps=params["total_timesteps"],
                          log_interval=params["log_interval"],
                          test_interval=params["test_interval"],
                          show_interval=params["show_interval"],
                          logdir=params["logdir"],
                          lr=params["lr"],
                          # lrschedule=params["lrschedule"],
                          max_grad_norm=params["max_grad_norm"],
                          units_per_hlayer=(params["units_shared_layer1"],
                                            params["units_shared_layer2"],
                                            params["units_policy_layer"]),
                          activ_fcn=params["activ_fcn"],
                          gamma=params["gamma"],
                          vf_coef=params["vf_coeff"],
                          ent_coef=params["ent_coeff"],
                          nsteps=params["nsteps"],
                          lam=params["lam"],
                          nminibatches=params["nminibatches"],
                          noptepochs=params["noptepochs"],
                          cliprange=params["cliprange"],
                          early_stop=params["early_stop"],
                          keep_model=params["keep_model"])
    ple_env.close()

    if not early_stopped:
        avg_perf, var_perf, max_return = eval_model(render=False, nepisodes=10, test_steps=3000, **params)

        with open(os.path.join(params["logdir"], 'hyperparams.txt'), 'a') as f:
            f.write('\n')
            f.write('Results: \n')
            f.write('average performance: ' + str(avg_perf) + '\n')
            f.write('performance variance: ' + str(var_perf) + '\n')
            f.write('maximum return: ' + str(max_return) + '\n')

        return avg_perf, var_perf, max_return
    else:
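        # Report a fixed, poor score for runs that were stopped early.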
        return -3000, 3000, -3000
Example #4
def main():
    parser = arg_parser()
    parser.add_argument('--early_stop', help='stop poorly performing runs earlier', type=bool, default=False)
    parser.add_argument('--nenvs', help='Number of parallel simulation environments', type=int, default=4)
    parser.add_argument('--activ_fcn', choices=['relu6', 'elu', 'mixed'], type=str, default='elu',
                        help='Activation functions of network layers', )
    parser.add_argument('--lr', help='Learning Rate', type=float, default=5e-4)
    parser.add_argument('--nsteps', type=int, default=32, help='number of samples based on which gradient is updated')
    parser.add_argument('--nminibatches', help='Number of minibatches per sampled data batch.', type=int, default=1)
    parser.add_argument('--noptepochs', help='Number of optimization epochs with sample data, i.e. how often samples are reused.', type=int, default=1)

    parser.add_argument('--lam', help='Lambda parameter for GAE', type=float, default=0.95)
    parser.add_argument('--cliprange', help='Defines the maximum policy change allowed before clipping.', type=float, default=0.2)
    parser.add_argument('--gamma', help='Discount factor for discounting the reward', type=float, default=0.90)
    parser.add_argument('--vf_coeff', help='Weight of value function loss in total loss', type=float, default=0.2)
    parser.add_argument('--ent_coeff', help='Weight of entropy in total loss', type=float, default=1e-7)
    parser.add_argument('--units_shared_layer1', help='Units in first hidden layer which is shared', type=int, default=64)
    parser.add_argument('--units_shared_layer2', help='Units in second hidden layer which is shared', type=int, default=64)
    parser.add_argument('--units_policy_layer', help='Units in hidden layer in policy head', type=int, default=64)

    parser.add_argument('--restore_model', help='whether a pretrained model shall be restored', type=bool, default=False)
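    # Note: argparse's type=bool converts any non-empty string to True, so passing e.g. '--early_stop False' still enables the flag.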
    args = parser.parse_args()

    seed = args.seed
    env = make_ple_envs(args.env, num_env=args.nenvs, seed=seed*10)
    # env = make_ple_envs('ContFlappyBird-hNS-nrf2-train-v0', num_env=args.nenvs, seed=seed - 1)
    test_env = make_ple_env(args.test_env, seed=3000)

    if args.architecture == 'ff':
        policy_fn = LargerMLPPolicy
    elif args.architecture == 'lstm':
        policy_fn = LargerLSTMPolicy
    elif args.architecture == 'gru':
        policy_fn = GRUPolicy
    else:
        print('Policy option %s is not implemented yet.' % args.architecture)

    # store hyperparameter settings
    # logdir = os.path.join(args.logdir, str(datetime.datetime.today()))
    # os.makedirs(logdir)

    ppo_output_dir = os.path.join(args.logdir, ('ppo_output'+str(args.seed)))
    if not os.path.isdir(ppo_output_dir):
        os.makedirs(ppo_output_dir)

    with open(os.path.join(ppo_output_dir, 'hyperparams.txt'), 'a') as f:
        for k,v in vars(args).items():
            f.write(k + ': ' + str(v) + '\n')

    logger = logging.getLogger()
    fh = logging.FileHandler(os.path.join(ppo_output_dir, 'algo.log'))
    fh.setLevel(logging.INFO)
    fh.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s:%(name)s: %(message)s'))
    logger.addHandler(fh)
    logger.setLevel(logging.INFO)
    logger.propagate = False

    early_stopped = learn(policy_fn,
                          env=env,
                          test_env=test_env,
                          seed=seed,
                          total_timesteps=args.total_timesteps,
                          log_interval=args.log_interval,
                          test_interval=args.test_interval,
                          show_interval=args.show_interval,
                          logdir=ppo_output_dir,
                          lr=args.lr,
                          # lrschedule=args.lrschedule,
                          max_grad_norm=args.max_grad_norm,
                          units_per_hlayer=(args.units_shared_layer1,
                                            args.units_shared_layer2,
                                            args.units_policy_layer),
                          activ_fcn=args.activ_fcn,
                          gamma=args.gamma,
                          vf_coef=args.vf_coeff,
                          ent_coef=args.ent_coeff,
                          nsteps=args.nsteps,
                          lam=args.lam,
                          nminibatches=args.nminibatches,
                          noptepochs=args.noptepochs,
                          cliprange=args.cliprange,
                          early_stop=args.early_stop,
                          keep_model=args.keep_model,
                          restore_model=args.restore_model)
    env.close()
Example #5
def main():
    parser = arg_parser()
    parser.add_argument('--early_stop',
                        help='stop poorly performing runs earlier',
                        type=bool,
                        default=False)
    parser.add_argument('--nenvs',
                        help='Number of parallel simulation environments',
                        type=int,
                        default=1)
    parser.add_argument(
        '--activ_fcn',
        choices=['relu6', 'elu', 'mixed'],
        type=str,
        default='relu6',
        help='Activation functions of network layers',
    )
    parser.add_argument('--lr', help='Learning Rate', type=float, default=5e-4)
    parser.add_argument(
        '--batch_size',
        type=int,
        default=50,
        help='number of samples based on which gradient is updated',
    )
    parser.add_argument('--gamma',
                        help='Discount factor for discounting the reward',
                        type=float,
                        default=0.90)
    parser.add_argument('--vf_coeff',
                        help='Weight of value function loss in total loss',
                        type=float,
                        default=0.2)
    parser.add_argument('--ent_coeff',
                        help='Weight of entropy in total loss',
                        type=float,
                        default=1e-7)
    parser.add_argument('--units_shared_layer1',
                        help='Units in first hidden layer which is shared',
                        type=int,
                        default=64)
    parser.add_argument('--units_shared_layer2',
                        help='Units in second hidden layer which is shared',
                        type=int,
                        default=64)
    parser.add_argument('--units_policy_layer',
                        help='Units in hidden layer in policy head',
                        type=int,
                        default=64)
    args = parser.parse_args()

    seed = args.seed
    env = make_ple_envs(args.env, num_env=args.nenvs, seed=seed - 1)
    test_env = make_ple_env(args.test_env, seed=100 + (seed - 1))
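    # The evaluation env gets an offset seed so it differs from the training envs.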

    if args.architecture == 'ff':
        policy_fn = MLPPolicy
    elif args.architecture == 'lstm':
        policy_fn = LSTMPolicy
    elif args.architecture == 'gru':
        policy_fn = GRUPolicy
    else:
        print('Policy option %s is not implemented yet.' % args.architecture)

    a2c_output_dir = os.path.join(args.logdir, ('a2c_output' + str(args.seed)))
    if not os.path.isdir(a2c_output_dir):
        os.makedirs(a2c_output_dir)

    with open(os.path.join(a2c_output_dir, 'hyperparams.txt'), 'a') as f:
        for k, v in vars(args).items():
            f.write(k + ': ' + str(v) + '\n')

    logger = logging.getLogger()
    fh = logging.FileHandler(os.path.join(a2c_output_dir, 'algo.log'))
    fh.setLevel(logging.INFO)
    fh.setFormatter(
        logging.Formatter('%(asctime)s - %(levelname)s:%(name)s: %(message)s'))
    logger.addHandler(fh)
    logger.setLevel(logging.INFO)
    logger.propagate = False

    early_stopped = learn(
        policy_fn,
        env=env,
        test_env=test_env,
        seed=seed,
        total_timesteps=args.total_timesteps,
        log_interval=args.log_interval,
        test_interval=args.test_interval,
        show_interval=args.show_interval,
        logdir=a2c_output_dir,
        lr=args.lr,
        # lrschedule=args.lrschedule,
        max_grad_norm=args.max_grad_norm,
        units_per_hlayer=(args.units_shared_layer1, args.units_shared_layer2,
                          args.units_policy_layer),
        activ_fcn=args.activ_fcn,
        gamma=args.gamma,
        vf_coef=args.vf_coeff,
        ent_coef=args.ent_coeff,
        batch_size=args.batch_size,
        early_stop=args.early_stop,
        keep_model=args.keep_model)
    env.close()
Example #6
def main():
    parser = arg_parser()
    parser.add_argument('--early_stop', help='stop poorly performing runs earlier', type=bool, default=False)
    parser.add_argument('--nenvs', help='Number of parallel simulation environments', type=int, default=1)
    parser.add_argument('--activ_fcn', choices=['relu6', 'elu', 'mixed'], type=str, default='mixed',
                        help='Activation functions of network layers', )
    parser.add_argument('--lr', help='Learning Rate', type=float, default=0.001)
    parser.add_argument('--nsteps', type=int, default=32, help='number of samples based on which gradient is updated')
    parser.add_argument('--gamma', help='Discount factor for discounting the reward', type=float, default=0.90)
    parser.add_argument('--vf_coeff', help='Weight of value function loss in total loss', type=float, default=0.2)
    parser.add_argument('--ent_coeff', help='Weight of entropy in total loss', type=float, default=7e-5)
    parser.add_argument('--units_shared_layer1', help='Units in first hidden layer which is shared', type=int,
                        default=28)
    parser.add_argument('--units_shared_layer2', help='Units in second hidden layer which is shared', type=int,
                        default=59)
    parser.add_argument('--units_policy_layer', help='Units in hidden layer in policy head', type=int, default=21)

    # PPO args
    parser.add_argument('--nminibatches', help='Number of minibatches per sampled data batch.', type=int, default=2)
    parser.add_argument('--noptepochs',
                        help='Number of optimization epochs with sample data, i.e. how often samples are reused.',
                        type=int, default=4)

    parser.add_argument('--lam', help='Lambda parameter for GAE', type=float, default=0.95)
    parser.add_argument('--cliprange', help='Defines the maximum policy change allowed before clipping.', type=float,
                        default=0.2)

    # MAML args
    parser.add_argument('--K', help='length of each rollout (=trajectory)', type=int, default=20) # Test how well it works with other measures.
    parser.add_argument('--train_batchsz', help='number of rollouts per adaptation/training update (=fast update)', type=int, default=1)
    parser.add_argument('--kshot', help='number of adaptation/training updates (=fast updates) per task between two meta updates', type=int, default=1000)
    parser.add_argument('--test_batchsz', help='number of rollouts with updated model on which test_loss is computed',
                        type=int, default=1)
    parser.add_argument('--meta_batchsz', help='number of sampled tasks per meta update', type=int, default=4)  # sampled in parallel or sequentially
    parser.add_argument('--test_stage', help='whether or not meta learner is in test_stage', type=bool, default=False)

    parser.add_argument('--base_agent', help='type of base learning agent, i.e. A2C or PPO agent', type=str, default='ppo')
    args = parser.parse_args()
    print(args)

    ple_env = make_ple_envs(args.env, args.nenvs, seed=args.seed-1)
    ple_test_env = make_ple_env(args.test_env, seed=100 + (args.seed-1))

    if args.architecture == 'ff':
        policy_fn = LargerMLPPolicy
    elif args.architecture == 'lstm':
        policy_fn = LargerLSTMPolicy
    elif args.architecture == 'gru':
        policy_fn = GRUPolicy
    else:
        print('Policy option %s is not implemented yet.' % args.architecture)

    output_dir = os.path.join(args.logdir, ('a2c_output'+str(args.seed)))
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    with open(os.path.join(output_dir, 'hyperparams.txt'), 'a') as f:
        for k,v in vars(args).items():
            f.write(k + ': ' + str(v) + '\n')

    logger = logging.getLogger()
    fh = logging.FileHandler(os.path.join(output_dir, 'algo.log'))
    fh.setLevel(logging.INFO)
    fh.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s:%(name)s: %(message)s'))
    logger.addHandler(fh)
    logger.setLevel(logging.INFO)
    logger.propagate = False

    # if not args.test_stage:  # construct training model
    #     pass
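    # Reuse the parsed args namespace as the kwargs container passed to meta_learn below.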
    args.env = ple_env
    args.test_env = ple_test_env
    args.logdir = output_dir
    args.units_per_hlayer=(args.units_shared_layer1,
                           args.units_shared_layer2,
                           args.units_policy_layer)
    args.policy = policy_fn

    args.total_timesteps = 200000

    meta_learn(**args.__dict__)
    ple_env.close()
Example #7
from run_ple_utils import make_ple_envs, make_ple_env
from models import MLPPolicy, LSTMPolicy, GRUPolicy

LOGDIR = './logs'  # hypothetical output directory for logs and learned models; set as needed


def main():
    seed = 42

    # ---- Specify the version of CFB ----
    game = 'ContFlappyBird'
    ns = ''  # '', 'gfNS', 'gsNS', 'rand_feat'
    nrandfeat = ('-nrf' + str(2))  # '', 0,2,3,4
    noiselevel = ('-nl' + str(0.001))  # '', 0.0001 - 0.05 (see env/__init__.py)
    experiment_phase = '-train'  # '-test', '-train'

    # Naming convention is <game>-<non-stationarity>-nl<noise_level>-nrf<nrandfeat>-<phase>-v0
    env_name = (game + ns + noiselevel + nrandfeat + experiment_phase + '-v0')
    test_env_name = (game + ns + noiselevel + nrandfeat + '-test' + '-v0')
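    # With the settings above, env_name == 'ContFlappyBird-nl0.001-nrf2-train-v0'
    # and test_env_name == 'ContFlappyBird-nl0.001-nrf2-test-v0'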

    # ---- Generate CFB with N parallel instances and with single instance ----
    ple_env = make_ple_envs(env_name, num_env=2,
                            seed=seed)  # N parallel instances
    test_env = make_ple_env(test_env_name, seed=seed + 42)  # single instance

    # ---- Import the RL method you want to use ----
    from A2C.a2c import learn
    # from PPO.ppo import learn
    # from DQN.dqn import q_learning

    # ---- Specify the model (FF, LSTM, GRU) ----
    model_architecture = 'ff'  # 'lstm', 'gru'

    if model_architecture == 'ff':
        policy_fn = MLPPolicy
    elif model_architecture == 'lstm':
        policy_fn = LSTMPolicy
    elif model_architecture == 'gru':
        policy_fn = GRUPolicy
    else:
        print('Policy option %s is not implemented yet.' % model_architecture)

    # ---- Learn an optimal policy. The agent's model ('final_model...') is stored in LOGDIR.
    early_stopped = learn(
        policy_fn,
        env=ple_env,
        test_env=test_env,
        seed=seed,
        total_timesteps=int(2e4),  # Total number of env steps
        log_interval=0,  # Network parameter values are stored in tensorboard summary every <log_interval> model update step. 0 --> no logging
        test_interval=0,  # Model is evaluated after <test_interval> model updates. 0 = do not evaluate while learning.
        show_interval=0,  # Env is rendered every n-th episode. 0 = no rendering
        logdir=LOGDIR,  # directory where logs and the learned models are stored
        lr=5e-4,  # Learning Rate
        max_grad_norm=0.01,  # Maximum gradient norm up to which gradient is not clipped
        units_per_hlayer=(64, 64, 64),  # Number of units per network layer
        activ_fcn='relu6',  # Type of activation function used in the network: 'relu6', 'elu', 'mixed'
        gamma=0.95,  # Discount factor for discounting the reward
        vf_coef=0.2,  # Weight on the value function loss in the loss function
        ent_coef=1e-7,  # Weight on the policy entropy in the loss function
        batch_size=64,  # number of samples based on which gradient is updated
        early_stop=False,  # whether or not to stop poorly performing runs earlier.
        keep_model=0,  # How many best models shall be kept during training. 0 -> only final model
    )
    ple_env.close()