def dqn_arg_parser():
    parser = arg_parser()
    parser.add_argument('--gamma',
                        help='Discount factor applied to future rewards',
                        type=float,
                        default=0.90)
    parser.add_argument('--tau',
                        help='Update rate of target network',
                        type=float,
                        default=0.99)
    parser.add_argument('--lr', help='Learning Rate', type=float, default=5e-4)
    parser.add_argument('--lrschedule',
                        help='Learning Rate Decay Schedule',
                        choices=['constant', 'linear', 'double_linear_con'],
                        default='constant')
    parser.add_argument(
        '--nbatch',
        help=
        'Batch size. Number of samples drawn from the buffer, which are used to update the model.',
        type=int,
        default=3)
    parser.add_argument('--buffer_size',
                        help='Replay buffer size',
                        type=int,
                        default=5000)
    parser.add_argument(
        '--trace_length',
        help='Length of the traces obtained from the batched episodes',
        type=int,
        default=8)
    parser.add_argument(
        '--max_grad_norm',
        help='Maximum gradient norm; gradients above this norm are clipped.',
        type=float,
        default=0.01)
    parser.add_argument(
        '--update_interval',
        type=int,
        default=5,
        help=
        'Frequency with which the network model is updated based on minibatch data.'
    )
    return parser.parse_args()
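
The --gamma and --tau flags above only parameterize the training routine; as a minimal, hedged sketch of what such values typically control in a DQN-style update (the helper names below are illustrative, not taken from this project):

def soft_update(target_params, online_params, tau=0.99):
    # Polyak averaging: the target network slowly tracks the online network.
    # Here tau weights the old target parameters (one common convention).
    return [tau * t + (1.0 - tau) * o for t, o in zip(target_params, online_params)]

def td_target(reward, done, max_next_q, gamma=0.90):
    # Discounted one-step TD target; terminal transitions get no bootstrap term.
    return reward + gamma * (1.0 - float(done)) * max_next_q
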
Example #2
def main():
    parser = arg_parser()
    parser.add_argument('--gamma',
                        help='Discount factor applied to future rewards',
                        type=float,
                        default=0.90)
    parser.add_argument('--epsilon',
                        help='Epsilon for epsilon-greedy policy',
                        type=float,
                        default=0.5)
    parser.add_argument('--epsilon_decay',
                        help='Epsilon decay rate',
                        type=float,
                        default=0.995)
    parser.add_argument('--tau',
                        help='Update rate of target network',
                        type=float,
                        default=0.99)
    parser.add_argument('--lr', help='Learning Rate', type=float, default=5e-4)
    parser.add_argument('--lrschedule',
                        help='Learning Rate Decay Schedule',
                        choices=['constant', 'linear', 'double_linear_con'],
                        default='constant')
    parser.add_argument(
        '--nbatch',
        help=
        'Batch size. Number of samples drawn from the buffer, which are used to update the model.',
        type=int,
        default=3)
    parser.add_argument('--buffer_size',
                        help='Replay buffer size',
                        type=int,
                        default=10)
    parser.add_argument(
        '--trace_length',
        help='Length of the traces obtained from the batched episodes',
        type=int,
        default=8)
    parser.add_argument(
        '--max_grad_norm',
        help='Maximum gradient norm; gradients above this norm are clipped.',
        type=float,
        default=0.01)
    parser.add_argument('--units_layer1',
                        help='Units in first hidden layer',
                        type=int,
                        default=64)
    parser.add_argument('--units_layer2',
                        help='Units in second hidden layer',
                        type=int,
                        default=64)
    parser.add_argument('--units_layer3',
                        help='Units in third hidden layer',
                        type=int,
                        default=64)
    parser.add_argument(
        '--update_interval',
        type=int,
        default=5,
        help=
        'Frequency with which the network model is updated based on minibatch data.'
    )
    # parser.add_argument('--log_interval', help='parameter values stored in tensorboard summary every <log_interval> model update step. 0 --> no logging ', type=int, default=30)
    # parser.add_argument('--show_interval', help='Env is rendered every n-th episode. 0 = no rendering', type=int, default=30)
    # parser.add_argument('--logdir', help='directory where logs are stored', default='/home/mara/Desktop/logs/A2C_OAI_NENVS')  # '/mnt/logs/A2C')
    args = parser.parse_args()

    seed = args.seed
    env = make_ple_env(args.env, seed=seed)
    test_env = make_ple_env(args.env, seed=seed)

    # logdir = os.path.join(args.logdir, str(datetime.datetime.today()))
    # os.makedirs(logdir)

    dqn_output_dir = os.path.join(args.logdir,
                                  ('dqn_rnn_output' + str(args.seed)))
    if not os.path.isdir(dqn_output_dir):  # create the output directory if it does not exist yet
        os.makedirs(dqn_output_dir)

    # store hyperparameter settings
    with open(os.path.join(dqn_output_dir, 'hyperparams.txt'), 'a') as f:
        for k, v in vars(args).items():
            f.write(k + ': ' + str(v) + '\n')

    logger = logging.getLogger()  # setting up the root logger is necessary to use a FileHandler
    logger.propagate = False
    fh = logging.FileHandler(os.path.join(dqn_output_dir, 'algo.log'))
    fh.setLevel(logging.INFO)
    fh.setFormatter(
        logging.Formatter('%(asctime)s - %(levelname)s:%(name)s: %(message)s'))
    logger.addHandler(fh)
    logger.setLevel(logging.INFO)

    q_learning(env,
               test_env=test_env,
               seed=seed,
               total_timesteps=args.total_timesteps,
               gamma=args.gamma,
               epsilon=args.epsilon,
               epsilon_decay=args.epsilon_decay,
               tau=args.tau,
               lr=args.lr,
               lrschedule=args.lrschedule,
               buffer_size=args.buffer_size,
               nbatch=args.nbatch,
               trace_length=args.trace_length,
               max_grad_norm=args.max_grad_norm,
               units_per_hlayer=(args.units_layer1, args.units_layer2,
                                 args.units_layer3),
               update_interval=args.update_interval,
               log_interval=args.log_interval,
               test_interval=args.test_interval,
               show_interval=args.show_interval,
               logdir=dqn_output_dir,
               keep_model=args.keep_model)
    env.close()

    args.logdir = dqn_output_dir
    avg_perf, var_perf, max_return = eval_model(render=False,
                                                nepisodes=15,
                                                **args.__dict__)

    with open(os.path.join(args.logdir, 'hyperparams.txt'), 'a') as f:
        f.write('\n')
        f.write('Results: \n')
        f.write('average performance: ' + str(avg_perf) + '\n')
        f.write('performance variance: ' + str(var_perf) + '\n')
        f.write('maximum return: ' + str(max_return) + '\n')
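
The --epsilon and --epsilon_decay flags above suggest epsilon-greedy exploration with multiplicative decay; a hedged, generic sketch (not the project's q_learning implementation):

import random

def epsilon_greedy(q_values, epsilon):
    # With probability epsilon pick a random action, otherwise the greedy one.
    if random.random() < epsilon:
        return random.randrange(len(q_values))
    return max(range(len(q_values)), key=lambda a: q_values[a])

# Multiplicative decay, e.g. applied once per episode or per update step.
epsilon, epsilon_decay = 0.5, 0.995
epsilon *= epsilon_decay

Example #3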
def main():
    parser = arg_parser()
    parser.add_argument('--early_stop', help='stop badly performing runs earlier', type=bool, default=False)
    parser.add_argument('--nenvs', help='Number of parallel simulation environments', type=int, default=4)
    parser.add_argument('--activ_fcn', choices=['relu6', 'elu', 'mixed'], type=str, default='elu',
                        help='Activation functions of network layers', )
    parser.add_argument('--lr', help='Learning Rate', type=float, default=5e-4)
    parser.add_argument('--nsteps', type=int, default=32, help='number of samples based on which gradient is updated')
    parser.add_argument('--nminibatches', help='Number of minibatches per sampled data batch.', type=int, default=1)
    parser.add_argument('--noptepochs', help='Number of optimization epochs with sample data, i.e. how often samples are reused.', type=int, default=1)

    parser.add_argument('--lam', help='Lambda parameter for GAE', type=float, default=0.95)
    parser.add_argument('--cliprange', help='Defines the maximum policy change allowed before clipping.', type=float, default=0.2)
    parser.add_argument('--gamma', help='Discount factor applied to future rewards', type=float, default=0.90)
    parser.add_argument('--vf_coeff', help='Weight of value function loss in total loss', type=float, default=0.2)
    parser.add_argument('--ent_coeff', help='Weight of entropy in total loss', type=float, default=1e-7)
    parser.add_argument('--units_shared_layer1', help='Units in first hidden layer which is shared', type=int, default=64)
    parser.add_argument('--units_shared_layer2', help='Units in second hidden layer which is shared', type=int, default=64)
    parser.add_argument('--units_policy_layer', help='Units in hidden layer in policy head', type=int, default=64)

    parser.add_argument('--restore_model', help='whether a pretrained model shall be restored', type=bool, default=False)
    args = parser.parse_args()

    seed = args.seed
    env = make_ple_envs(args.env, num_env=args.nenvs, seed=seed*10)
    # env = make_ple_envs('ContFlappyBird-hNS-nrf2-train-v0', num_env=args.nenvs, seed=seed - 1)
    test_env = make_ple_env(args.test_env, seed=3000)

    if args.architecture == 'ff':
        policy_fn = LargerMLPPolicy
    elif args.architecture == 'lstm':
        policy_fn = LargerLSTMPolicy
    elif args.architecture == 'gru':
        policy_fn = GRUPolicy
    else:
        raise ValueError('Policy option %s is not implemented yet.' % args.architecture)

    # store hyperparameter settings
    # logdir = os.path.join(args.logdir, str(datetime.datetime.today()))
    # os.makedirs(logdir)

    ppo_output_dir = os.path.join(args.logdir, ('ppo_output'+str(args.seed)))
    if not os.path.isdir(ppo_output_dir):
        os.makedirs(ppo_output_dir)

    with open(os.path.join(ppo_output_dir, 'hyperparams.txt'), 'a') as f:
        for k,v in vars(args).items():
            f.write(k + ': ' + str(v) + '\n')

    logger = logging.getLogger()
    fh = logging.FileHandler(os.path.join(ppo_output_dir, 'algo.log'))
    fh.setLevel(logging.INFO)
    fh.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s:%(name)s: %(message)s'))
    logger.addHandler(fh)
    logger.setLevel(logging.INFO)
    logger.propagate = False

    early_stopped = learn(policy_fn,
                          env=env,
                          test_env=test_env,
                          seed=seed,
                          total_timesteps=args.total_timesteps,
                          log_interval=args.log_interval,
                          test_interval=args.test_interval,
                          show_interval=args.show_interval,
                          logdir=ppo_output_dir,
                          lr=args.lr,
                          # lrschedule=args.lrschedule,
                          max_grad_norm=args.max_grad_norm,
                          units_per_hlayer=(args.units_shared_layer1,
                                            args.units_shared_layer2,
                                            args.units_policy_layer),
                          activ_fcn=args.activ_fcn,
                          gamma=args.gamma,
                          vf_coef=args.vf_coeff,
                          ent_coef=args.ent_coeff,
                          nsteps=args.nsteps,
                          lam=args.lam,
                          nminibatches=args.nminibatches,
                          noptepochs=args.noptepochs,
                          cliprange=args.cliprange,
                          early_stop=args.early_stop,
                          keep_model=args.keep_model,
                          restore_model=args.restore_model)
    env.close()
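
The --cliprange and --lam arguments above feed PPO's clipped surrogate objective and generalized advantage estimation (GAE); the helpers below are a hedged, standalone sketch of those two pieces, not the project's learn function:

def clipped_surrogate(ratio, advantage, cliprange=0.2):
    # PPO objective term for one sample: the probability ratio is clipped to
    # [1 - cliprange, 1 + cliprange] to limit how far the policy can move.
    clipped = min(max(ratio, 1.0 - cliprange), 1.0 + cliprange)
    return min(ratio * advantage, clipped * advantage)

def gae(rewards, values, last_value, gamma=0.90, lam=0.95):
    # Generalized advantage estimation over one trajectory (done masks omitted).
    advantages, next_value, running = [], last_value, 0.0
    for r, v in zip(reversed(rewards), reversed(values)):
        delta = r + gamma * next_value - v
        running = delta + gamma * lam * running
        advantages.append(running)
        next_value = v
    return list(reversed(advantages))
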
Example #4
def main():
    parser = arg_parser()
    parser.add_argument('--early_stop',
                        help='stop badly performing runs earlier',
                        type=bool,
                        default=False)
    parser.add_argument('--nenvs',
                        help='Number of parallel simulation environments',
                        type=int,
                        default=1)
    parser.add_argument(
        '--activ_fcn',
        choices=['relu6', 'elu', 'mixed'],
        type=str,
        default='relu6',
        help='Activation functions of network layers',
    )
    parser.add_argument('--lr', help='Learning Rate', type=float, default=5e-4)
    parser.add_argument(
        '--batch_size',
        type=int,
        default=50,
        help='number of samples based on which gradient is updated',
    )
    parser.add_argument('--gamma',
                        help='Discount factor applied to future rewards',
                        type=float,
                        default=0.90)
    parser.add_argument('--vf_coeff',
                        help='Weight of value function loss in total loss',
                        type=float,
                        default=0.2)
    parser.add_argument('--ent_coeff',
                        help='Weight of entropy in total loss',
                        type=float,
                        default=1e-7)
    parser.add_argument('--units_shared_layer1',
                        help='Units in first hidden layer which is shared',
                        type=int,
                        default=64)
    parser.add_argument('--units_shared_layer2',
                        help='Units in second hidden layer which is shared',
                        type=int,
                        default=64)
    parser.add_argument('--units_policy_layer',
                        help='Units in hidden layer in policy head',
                        type=int,
                        default=64)
    args = parser.parse_args()

    seed = args.seed
    env = make_ple_envs(args.env, num_env=args.nenvs, seed=seed - 1)
    test_env = make_ple_env(args.test_env, seed=100 + (seed - 1))

    if args.architecture == 'ff':
        policy_fn = MLPPolicy
    elif args.architecture == 'lstm':
        policy_fn = LSTMPolicy
    elif args.architecture == 'gru':
        policy_fn = GRUPolicy
    else:
        raise ValueError('Policy option %s is not implemented yet.' % args.architecture)

    a2c_output_dir = os.path.join(args.logdir, ('a2c_output' + str(args.seed)))
    if not os.path.isdir(a2c_output_dir):
        os.makedirs(a2c_output_dir)

    with open(os.path.join(a2c_output_dir, 'hyperparams.txt'), 'a') as f:
        for k, v in vars(args).items():
            f.write(k + ': ' + str(v) + '\n')

    logger = logging.getLogger()
    fh = logging.FileHandler(os.path.join(a2c_output_dir, 'algo.log'))
    fh.setLevel(logging.INFO)
    fh.setFormatter(
        logging.Formatter('%(asctime)s - %(levelname)s:%(name)s: %(message)s'))
    logger.addHandler(fh)
    logger.setLevel(logging.INFO)
    logger.propagate = False

    early_stopped = learn(
        policy_fn,
        env=env,
        test_env=test_env,
        seed=seed,
        total_timesteps=args.total_timesteps,
        log_interval=args.log_interval,
        test_interval=args.test_interval,
        show_interval=args.show_interval,
        logdir=a2c_output_dir,
        lr=args.lr,
        # lrschedule=args.lrschedule,
        max_grad_norm=args.max_grad_norm,
        units_per_hlayer=(args.units_shared_layer1, args.units_shared_layer2,
                          args.units_policy_layer),
        activ_fcn=args.activ_fcn,
        gamma=args.gamma,
        vf_coef=args.vf_coeff,
        ent_coef=args.ent_coeff,
        batch_size=args.batch_size,
        early_stop=args.early_stop,
        keep_model=args.keep_model)
    env.close()
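
In the A2C example above, --vf_coeff and --ent_coeff only weight terms of the actor-critic loss; a hedged sketch of how such a total loss is commonly assembled (generic, not the project's learn function):

def a2c_loss(pg_loss, vf_loss, entropy, vf_coef=0.2, ent_coef=1e-7):
    # Policy-gradient loss plus weighted value-function loss, minus an entropy
    # bonus that encourages exploration.
    return pg_loss + vf_coef * vf_loss - ent_coef * entropy

Example #5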
def main():
    parser = arg_parser()
    parser.add_argument('--early_stop', help='stop badly performing runs earlier', type=bool, default=False)
    parser.add_argument('--nenvs', help='Number of parallel simulation environments', type=int, default=1)
    parser.add_argument('--activ_fcn', choices=['relu6', 'elu', 'mixed'], type=str, default='mixed',
                        help='Activation functions of network layers', )
    parser.add_argument('--lr', help='Learning Rate', type=float, default=0.001)
    parser.add_argument('--nsteps', type=int, default=32, help='number of samples based on which gradient is updated')
    parser.add_argument('--gamma', help='Discount factor applied to future rewards', type=float, default=0.90)
    parser.add_argument('--vf_coeff', help='Weight of value function loss in total loss', type=float, default=0.2)
    parser.add_argument('--ent_coeff', help='Weight of entropy in total loss', type=float, default=7e-5)
    parser.add_argument('--units_shared_layer1', help='Units in first hidden layer which is shared', type=int,
                        default=28)
    parser.add_argument('--units_shared_layer2', help='Units in second hidden layer which is shared', type=int,
                        default=59)
    parser.add_argument('--units_policy_layer', help='Units in hidden layer in policy head', type=int, default=21)

    # PPO args
    parser.add_argument('--nminibatches', help='Number of minibatches per sampled data batch.', type=int, default=2)
    parser.add_argument('--noptepochs',
                        help='Number of optimization epochs with sample data, i.e. how often samples are reused.',
                        type=int, default=4)

    parser.add_argument('--lam', help='Lambda parameter for GAE', type=float, default=0.95)
    parser.add_argument('--cliprange', help='Defines the maximum policy change allowed before clipping.', type=float,
                        default=0.2)

    # MAML args
    parser.add_argument('--K', help='length of each rollout (=trajectory)', type=int, default=20) # Test how well it works with other measures.
    parser.add_argument('--train_batchsz', help='number of rollouts per adaptation/training update (=fast update)', type=int, default=1)
    parser.add_argument('--kshot', help='number of adaptation/training update (=fast updates) per task between two meta updates', type=int, default=1000)
    parser.add_argument('--test_batchsz', help='number of rollouts with updated model on which test_loss is computed',
                        type=int, default=1)
    parser.add_argument('--meta_batchsz', help='number of sampled tasks per meta update', type=int, default=4)  # in parallel or sequentially
    parser.add_argument('--test_stage', help='whether or not meta learner is in test_stage', type=bool, default=False)

    parser.add_argument('--base_agent', help='type of base learning agent, i.e. A2C or PPO agent', type=str, default='ppo')
    args = parser.parse_args()
    print(args)

    ple_env = make_ple_envs(args.env, args.nenvs, seed=args.seed-1)
    ple_test_env = make_ple_env(args.test_env, seed=100 + (args.seed-1))

    if args.architecture == 'ff':
        policy_fn = LargerMLPPolicy
    elif args.architecture == 'lstm':
        policy_fn = LargerLSTMPolicy
    elif args.architecture == 'gru':
        policy_fn = GRUPolicy
    else:
        raise ValueError('Policy option %s is not implemented yet.' % args.architecture)

    output_dir = os.path.join(args.logdir, ('a2c_output'+str(args.seed)))
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    with open(os.path.join(output_dir, 'hyperparams.txt'), 'a') as f:
        for k,v in vars(args).items():
            f.write(k + ': ' + str(v) + '\n')

    logger = logging.getLogger()
    fh = logging.FileHandler(os.path.join(output_dir, 'algo.log'))
    fh.setLevel(logging.INFO)
    fh.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s:%(name)s: %(message)s'))
    logger.addHandler(fh)
    logger.setLevel(logging.INFO)
    logger.propagate = False

    # if not args.test_stage:  # construct training model
    #     pass
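    # Reuse the parsed args namespace as the kwargs dict for meta_learn below.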
    args.env = ple_env
    args.test_env = ple_test_env
    args.logdir = output_dir
    args.units_per_hlayer=(args.units_shared_layer1,
                           args.units_shared_layer2,
                           args.units_policy_layer)
    args.policy = policy_fn

    args.total_timesteps = 200000

    meta_learn(**args.__dict__)
    ple_env.close()
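
The MAML flags above (--meta_batchsz, --kshot, --K, --train_batchsz) imply the usual two-level training loop; the skeleton below is only a hedged reading of those flags, with sample_tasks, adapt and meta_update as hypothetical stand-ins rather than functions of this project:

def sample_tasks(n):
    return list(range(n))   # placeholder task sampler

def adapt(params, task):
    return params           # placeholder fast/inner update on K-step rollouts

def meta_update(params, adapted):
    return params           # placeholder outer update from per-task test losses

params, meta_batchsz, kshot = {}, 4, 1000
adapted = []
for task in sample_tasks(meta_batchsz):    # meta_batchsz tasks per meta update
    fast_params = params
    for _ in range(kshot):                 # kshot fast updates per task
        fast_params = adapt(fast_params, task)
    adapted.append(fast_params)
params = meta_update(params, adapted)
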
Example #6
def main():
    parser = arg_parser()
    parser.add_argument('--early_stop',
                        help='stop badly performing runs earlier',
                        type=bool,
                        default=False)
    parser.add_argument('--gamma',
                        help='Discount factor applied to future rewards',
                        type=float,
                        default=0.90)
    parser.add_argument('--epsilon',
                        help='Epsilon for epsilon-greedy policy',
                        type=float,
                        default=0.5)
    parser.add_argument('--epsilon_decay',
                        help='Epsilon decay rate',
                        type=float,
                        default=0.995)
    parser.add_argument('--tau',
                        help='Update rate of target network',
                        type=float,
                        default=0.99)
    parser.add_argument('--lr', help='Learning Rate', type=float, default=5e-4)
    parser.add_argument('--buffer_size',
                        help='Replay buffer size',
                        type=int,
                        default=500)
    parser.add_argument(
        '--batch_size',
        help=
        'Batch size. Number of samples drawn from buffer, which are used to update the model.',
        type=int,
        default=50)
    parser.add_argument(
        '--trace_length',
        help='Length of the traces obtained from the batched episodes',
        type=int,
        default=1)
    parser.add_argument('--units_layer1',
                        help='Units in first hidden layer',
                        type=int,
                        default=64)
    parser.add_argument('--units_layer2',
                        help='Units in second hidden layer',
                        type=int,
                        default=64)
    parser.add_argument('--units_layer3',
                        help='Units in third hidden layer',
                        type=int,
                        default=64)
    parser.add_argument(
        '--activ_fcn',
        choices=['relu6', 'elu', 'mixed'],
        type=str,
        default='relu6',
        help='Activation functions of network layers',
    )
    parser.add_argument(
        '--update_interval',
        type=int,
        default=30,
        help=
        'Frequency with which the network model is updated based on minibatch data.'
    )
    args = parser.parse_args()

    assert (args.buffer_size > (args.batch_size * args.trace_length)
            ), 'Buffer size needs to be larger than batch_size * trace_length!'

    seed = args.seed
    env = make_ple_env(args.env, seed=seed - 1)
    # env = make_ple_env('ContFlappyBird-hNS-nrf2-test-v0', seed=seed-1)
    test_env = make_ple_env(args.test_env, seed=100 + (seed - 1))

    if args.architecture == 'ff':
        q_network = FF_DQN
        args.trace_length = 1
    elif args.architecture == 'lstm':
        q_network = LSTM_DQN
    elif args.architecture == 'gru':
        q_network = GRU_DQN
    else:
        raise ValueError('Policy option %s is not implemented yet.' % args.architecture)

    # logdir = os.path.join(args.logdir, str(datetime.datetime.today()))
    # os.makedirs(logdir)

    dqn_output_dir = os.path.join(args.logdir, ('dqn_output' + str(args.seed)))
    if not os.path.isdir(dqn_output_dir):
        os.makedirs(dqn_output_dir)

    # store hyperparameter settings
    with open(os.path.join(dqn_output_dir, 'hyperparams.txt'), 'a') as f:
        for k, v in vars(args).items():
            f.write(k + ': ' + str(v) + '\n')

    logger = logging.getLogger()  # setting up the root logger is necessary to use a FileHandler
    fh = logging.FileHandler(os.path.join(dqn_output_dir, 'algo.log'))
    fh.setLevel(logging.INFO)
    fh.setFormatter(
        logging.Formatter('%(asctime)s - %(levelname)s:%(name)s: %(message)s'))
    logger.addHandler(fh)
    logger.setLevel(logging.INFO)
    logger.propagate = False

    # If the experience replay buffer holds fewer samples than batch_size * trace_length, not enough
    # observations can be drawn to compute an update step, and the code throws an error.
    if args.buffer_size < (args.batch_size * args.trace_length):
        logger.info(
            'Experience replay buffer is too small. Should be bigger than batch_size * trace_length = %i * %i'
            % (args.batch_size, args.trace_length))
        # return -3000, 3000, -3000

    early_stopped, _ = q_learning(q_network=q_network,
                                  env=env,
                                  test_env=test_env,
                                  seed=seed,
                                  total_timesteps=args.total_timesteps,
                                  log_interval=args.log_interval,
                                  test_interval=args.test_interval,
                                  show_interval=args.show_interval,
                                  logdir=dqn_output_dir,
                                  lr=args.lr,
                                  max_grad_norm=args.max_grad_norm,
                                  units_per_hlayer=(args.units_layer1,
                                                    args.units_layer2,
                                                    args.units_layer3),
                                  activ_fcn=args.activ_fcn,
                                  gamma=args.gamma,
                                  epsilon=args.epsilon,
                                  epsilon_decay=args.epsilon_decay,
                                  buffer_size=args.buffer_size,
                                  batch_size=args.batch_size,
                                  trace_length=args.trace_length,
                                  tau=args.tau,
                                  update_interval=args.update_interval,
                                  early_stop=args.early_stop,
                                  keep_model=args.keep_model)
    env.close()

    args.logdir = dqn_output_dir