def dqn_arg_parser():
    parser = arg_parser()
    parser.add_argument('--gamma', help='Discount factor for discounting the reward', type=float, default=0.90)
    parser.add_argument('--tau', help='Update rate of target network', type=float, default=0.99)
    parser.add_argument('--lr', help='Learning Rate', type=float, default=5e-4)
    parser.add_argument('--lrschedule', help='Learning Rate Decay Schedule',
                        choices=['constant', 'linear', 'double_linear_con'], default='constant')
    parser.add_argument('--nbatch',
                        help='Batch size. Number of samples drawn from buffer, which are used to update the model.',
                        type=int, default=3)
    parser.add_argument('--buffer_size', help='Replay buffer size', type=int, default=5000)
    parser.add_argument('--trace_length', help='Length of the traces obtained from the batched episodes',
                        type=int, default=8)
    parser.add_argument('--max_grad_norm', help='Maximum gradient norm up to which the gradient is not clipped',
                        type=float, default=0.01)
    parser.add_argument('--update_interval', type=int, default=5,
                        help='Frequency with which the network model is updated based on minibatch data.')
    return parser.parse_args()
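# Usage sketch (illustrative, not part of the original scripts): dqn_arg_parser() already
# calls parse_args(), so it reads sys.argv directly and returns the parsed namespace, whose
# fields can then be forwarded to a training routine.
#
#     if __name__ == '__main__':
#         dqn_args = dqn_arg_parser()
#         print(vars(dqn_args))  # e.g. {'gamma': 0.9, 'tau': 0.99, 'lr': 0.0005, ...}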
def main():
    parser = arg_parser()
    parser.add_argument('--gamma', help='Discount factor for discounting the reward', type=float, default=0.90)
    parser.add_argument('--epsilon', help='Epsilon for epsilon-greedy policy', type=float, default=0.5)
    parser.add_argument('--epsilon_decay', help='Epsilon decay rate', type=float, default=0.995)
    parser.add_argument('--tau', help='Update rate of target network', type=float, default=0.99)
    parser.add_argument('--lr', help='Learning Rate', type=float, default=5e-4)
    parser.add_argument('--lrschedule', help='Learning Rate Decay Schedule',
                        choices=['constant', 'linear', 'double_linear_con'], default='constant')
    parser.add_argument('--nbatch',
                        help='Batch size. Number of samples drawn from buffer, which are used to update the model.',
                        type=int, default=3)
    parser.add_argument('--buffer_size', help='Replay buffer size', type=int, default=10)
    parser.add_argument('--trace_length', help='Length of the traces obtained from the batched episodes',
                        type=int, default=8)
    parser.add_argument('--max_grad_norm', help='Maximum gradient norm up to which the gradient is not clipped',
                        type=float, default=0.01)
    parser.add_argument('--units_layer1', help='Units in first hidden layer', type=int, default=64)
    parser.add_argument('--units_layer2', help='Units in second hidden layer', type=int, default=64)
    parser.add_argument('--units_layer3', help='Units in third hidden layer', type=int, default=64)
    parser.add_argument('--update_interval', type=int, default=5,
                        help='Frequency with which the network model is updated based on minibatch data.')
    # parser.add_argument('--log_interval', help='parameter values stored in tensorboard summary every <log_interval> model update step. 0 --> no logging', type=int, default=30)
    # parser.add_argument('--show_interval', help='Env is rendered every n-th episode. 0 = no rendering', type=int, default=30)
    # parser.add_argument('--logdir', help='directory where logs are stored', default='/home/mara/Desktop/logs/A2C_OAI_NENVS')  # '/mnt/logs/A2C'
    args = parser.parse_args()
    seed = args.seed
    env = make_ple_env(args.env, seed=seed)
    test_env = make_ple_env(args.env, seed=seed)

    # logdir = os.path.join(args.logdir, str(datetime.datetime.today()))
    # os.makedirs(logdir)
    dqn_output_dir = os.path.join(args.logdir, ('dqn_rnn_output' + str(args.seed)))
    if not os.path.isdir(dqn_output_dir):
        os.makedirs(dqn_output_dir)

    # Store hyperparameter settings.
    with open(os.path.join(dqn_output_dir, 'hyperparams.txt'), 'a') as f:
        for k, v in vars(args).items():
            f.write(k + ': ' + str(v) + '\n')

    # Setting up the root logger is necessary in order to use the FileHandler.
    logger = logging.getLogger()
    logger.propagate = False
    fh = logging.FileHandler(os.path.join(dqn_output_dir, 'algo.log'))
    fh.setLevel(logging.INFO)
    fh.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s:%(name)s: %(message)s'))
    logger.addHandler(fh)
    logger.setLevel(logging.INFO)

    q_learning(env,
               test_env=test_env,
               seed=seed,
               total_timesteps=args.total_timesteps,
               gamma=args.gamma,
               epsilon=args.epsilon,
               epsilon_decay=args.epsilon_decay,
               tau=args.tau,
               lr=args.lr,
               lrschedule=args.lrschedule,
               buffer_size=args.buffer_size,
               nbatch=args.nbatch,
               trace_length=args.trace_length,
               max_grad_norm=args.max_grad_norm,
               units_per_hlayer=(args.units_layer1, args.units_layer2, args.units_layer3),
               update_interval=args.update_interval,
               log_interval=args.log_interval,
               test_interval=args.test_interval,
               show_interval=args.show_interval,
               logdir=dqn_output_dir,
               keep_model=args.keep_model)
    env.close()

    # Evaluate the trained model and append the results to the hyperparameter file.
    args.logdir = dqn_output_dir
    avg_perf, var_perf, max_return = eval_model(render=False, nepisodes=15, **args.__dict__)
    with open(os.path.join(args.logdir, 'hyperparams.txt'), 'a') as f:
        f.write('\n')
        f.write('Results: \n')
        f.write('average performance: ' + str(avg_perf) + '\n')
        f.write('performance variance: ' + str(var_perf) + '\n')
        f.write('maximum return: ' + str(max_return) + '\n')
def main():
    parser = arg_parser()
    parser.add_argument('--early_stop', help='Stop badly performing runs earlier', type=bool, default=False)
    parser.add_argument('--nenvs', help='Number of parallel simulation environments', type=int, default=4)
    parser.add_argument('--activ_fcn', choices=['relu6', 'elu', 'mixed'], type=str, default='elu',
                        help='Activation functions of network layers')
    parser.add_argument('--lr', help='Learning Rate', type=float, default=5e-4)
    parser.add_argument('--nsteps', type=int, default=32,
                        help='Number of samples based on which gradient is updated')
    parser.add_argument('--nminibatches', help='Number of minibatches per sampled data batch.', type=int, default=1)
    parser.add_argument('--noptepochs',
                        help='Number of optimization epochs with sample data, i.e. how often samples are reused.',
                        type=int, default=1)
    parser.add_argument('--lam', help='Lambda parameter for GAE', type=float, default=0.95)
    parser.add_argument('--cliprange', help='Defines the maximum policy change allowed, before clipping.',
                        type=float, default=0.2)
    parser.add_argument('--gamma', help='Discount factor for discounting the reward', type=float, default=0.90)
    parser.add_argument('--vf_coeff', help='Weight of value function loss in total loss', type=float, default=0.2)
    parser.add_argument('--ent_coeff', help='Weight of entropy in total loss', type=float, default=1e-7)
    parser.add_argument('--units_shared_layer1', help='Units in first hidden layer which is shared',
                        type=int, default=64)
    parser.add_argument('--units_shared_layer2', help='Units in second hidden layer which is shared',
                        type=int, default=64)
    parser.add_argument('--units_policy_layer', help='Units in hidden layer in policy head',
                        type=int, default=64)
    parser.add_argument('--restore_model', help='Whether a pretrained model shall be restored',
                        type=bool, default=False)
    args = parser.parse_args()
    seed = args.seed
    env = make_ple_envs(args.env, num_env=args.nenvs, seed=seed * 10)
    # env = make_ple_envs('ContFlappyBird-hNS-nrf2-train-v0', num_env=args.nenvs, seed=seed - 1)
    test_env = make_ple_env(args.test_env, seed=3000)

    if args.architecture == 'ff':
        policy_fn = LargerMLPPolicy
    elif args.architecture == 'lstm':
        policy_fn = LargerLSTMPolicy
    elif args.architecture == 'gru':
        policy_fn = GRUPolicy
    else:
        print('Policy option %s is not implemented yet.' % args.policy)

    # Store hyperparameter settings.
    # logdir = os.path.join(args.logdir, str(datetime.datetime.today()))
    # os.makedirs(logdir)
    ppo_output_dir = os.path.join(args.logdir, ('ppo_output' + str(args.seed)))
    if not os.path.isdir(ppo_output_dir):
        os.makedirs(ppo_output_dir)
    with open(os.path.join(ppo_output_dir, 'hyperparams.txt'), 'a') as f:
        for k, v in vars(args).items():
            f.write(k + ': ' + str(v) + '\n')

    logger = logging.getLogger()
    fh = logging.FileHandler(os.path.join(ppo_output_dir, 'algo.log'))
    fh.setLevel(logging.INFO)
    fh.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s:%(name)s: %(message)s'))
    logger.addHandler(fh)
    logger.setLevel(logging.INFO)
    logger.propagate = False

    early_stopped = learn(policy_fn,
                          env=env,
                          test_env=test_env,
                          seed=seed,
                          total_timesteps=args.total_timesteps,
                          log_interval=args.log_interval,
                          test_interval=args.test_interval,
                          show_interval=args.show_interval,
                          logdir=ppo_output_dir,
                          lr=args.lr,
                          # lrschedule=args.lrschedule,
                          max_grad_norm=args.max_grad_norm,
                          units_per_hlayer=(args.units_shared_layer1,
                                            args.units_shared_layer2,
                                            args.units_policy_layer),
                          activ_fcn=args.activ_fcn,
                          gamma=args.gamma,
                          vf_coef=args.vf_coeff,
                          ent_coef=args.ent_coeff,
                          nsteps=args.nsteps,
                          lam=args.lam,
                          nminibatches=args.nminibatches,
                          noptepochs=args.noptepochs,
                          cliprange=args.cliprange,
                          early_stop=args.early_stop,
                          keep_model=args.keep_model,
                          restore_model=args.restore_model)
    env.close()
def main():
    parser = arg_parser()
    parser.add_argument('--early_stop', help='Stop badly performing runs earlier', type=bool, default=False)
    parser.add_argument('--nenvs', help='Number of parallel simulation environments', type=int, default=1)
    parser.add_argument('--activ_fcn', choices=['relu6', 'elu', 'mixed'], type=str, default='relu6',
                        help='Activation functions of network layers')
    parser.add_argument('--lr', help='Learning Rate', type=float, default=5e-4)
    parser.add_argument('--batch_size', type=int, default=50,
                        help='Number of samples based on which gradient is updated')
    parser.add_argument('--gamma', help='Discount factor for discounting the reward', type=float, default=0.90)
    parser.add_argument('--vf_coeff', help='Weight of value function loss in total loss', type=float, default=0.2)
    parser.add_argument('--ent_coeff', help='Weight of entropy in total loss', type=float, default=1e-7)
    parser.add_argument('--units_shared_layer1', help='Units in first hidden layer which is shared',
                        type=int, default=64)
    parser.add_argument('--units_shared_layer2', help='Units in second hidden layer which is shared',
                        type=int, default=64)
    parser.add_argument('--units_policy_layer', help='Units in hidden layer in policy head',
                        type=int, default=64)
    args = parser.parse_args()
    seed = args.seed
    env = make_ple_envs(args.env, num_env=args.nenvs, seed=seed - 1)
    test_env = make_ple_env(args.test_env, seed=100 + (seed - 1))

    if args.architecture == 'ff':
        policy_fn = MLPPolicy
    elif args.architecture == 'lstm':
        policy_fn = LSTMPolicy
    elif args.architecture == 'gru':
        policy_fn = GRUPolicy
    else:
        print('Policy option %s is not implemented yet.' % args.policy)

    a2c_output_dir = os.path.join(args.logdir, ('a2c_output' + str(args.seed)))
    if not os.path.isdir(a2c_output_dir):
        os.makedirs(a2c_output_dir)

    # Store hyperparameter settings.
    with open(os.path.join(a2c_output_dir, 'hyperparams.txt'), 'a') as f:
        for k, v in vars(args).items():
            f.write(k + ': ' + str(v) + '\n')

    logger = logging.getLogger()
    fh = logging.FileHandler(os.path.join(a2c_output_dir, 'algo.log'))
    fh.setLevel(logging.INFO)
    fh.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s:%(name)s: %(message)s'))
    logger.addHandler(fh)
    logger.setLevel(logging.INFO)
    logger.propagate = False

    early_stopped = learn(policy_fn,
                          env=env,
                          test_env=test_env,
                          seed=seed,
                          total_timesteps=args.total_timesteps,
                          log_interval=args.log_interval,
                          test_interval=args.test_interval,
                          show_interval=args.show_interval,
                          logdir=a2c_output_dir,
                          lr=args.lr,
                          # lrschedule=args.lrschedule,
                          max_grad_norm=args.max_grad_norm,
                          units_per_hlayer=(args.units_shared_layer1,
                                            args.units_shared_layer2,
                                            args.units_policy_layer),
                          activ_fcn=args.activ_fcn,
                          gamma=args.gamma,
                          vf_coef=args.vf_coeff,
                          ent_coef=args.ent_coeff,
                          batch_size=args.batch_size,
                          early_stop=args.early_stop,
                          keep_model=args.keep_model)
    env.close()
def main():
    parser = arg_parser()
    parser.add_argument('--early_stop', help='Stop badly performing runs earlier', type=bool, default=False)
    parser.add_argument('--nenvs', help='Number of parallel simulation environments', type=int, default=1)
    parser.add_argument('--activ_fcn', choices=['relu6', 'elu', 'mixed'], type=str, default='mixed',
                        help='Activation functions of network layers')
    parser.add_argument('--lr', help='Learning Rate', type=float, default=0.001)
    parser.add_argument('--nsteps', type=int, default=32,
                        help='Number of samples based on which gradient is updated')
    parser.add_argument('--gamma', help='Discount factor for discounting the reward', type=float, default=0.90)
    parser.add_argument('--vf_coeff', help='Weight of value function loss in total loss', type=float, default=0.2)
    parser.add_argument('--ent_coeff', help='Weight of entropy in total loss', type=float, default=7e-5)
    parser.add_argument('--units_shared_layer1', help='Units in first hidden layer which is shared',
                        type=int, default=28)
    parser.add_argument('--units_shared_layer2', help='Units in second hidden layer which is shared',
                        type=int, default=59)
    parser.add_argument('--units_policy_layer', help='Units in hidden layer in policy head',
                        type=int, default=21)

    # PPO args
    parser.add_argument('--nminibatches', help='Number of minibatches per sampled data batch.', type=int, default=2)
    parser.add_argument('--noptepochs',
                        help='Number of optimization epochs with sample data, i.e. how often samples are reused.',
                        type=int, default=4)
    parser.add_argument('--lam', help='Lambda parameter for GAE', type=float, default=0.95)
    parser.add_argument('--cliprange', help='Defines the maximum policy change allowed, before clipping.',
                        type=float, default=0.2)

    # MAML args
    parser.add_argument('--K', help='Length of each rollout (=trajectory)',
                        type=int, default=20)  # Test how well it works with other measures.
    parser.add_argument('--train_batchsz',
                        help='Number of rollouts per adaptation/training update (=fast update)',
                        type=int, default=1)
    parser.add_argument('--kshot',
                        help='Number of adaptation/training updates (=fast updates) per task between two meta updates',
                        type=int, default=1000)
    parser.add_argument('--test_batchsz',
                        help='Number of rollouts with the updated model on which test_loss is computed',
                        type=int, default=1)
    parser.add_argument('--meta_batchsz', help='Number of sampled tasks per meta update',
                        type=int, default=4)  # in parallel or sequentially
    parser.add_argument('--test_stage', help='Whether or not the meta learner is in test stage',
                        type=bool, default=False)
    parser.add_argument('--base_agent', help='Type of base learning agent, i.e. A2C or PPO agent',
                        type=str, default='ppo')
    args = parser.parse_args()
    print(args)

    ple_env = make_ple_envs(args.env, args.nenvs, seed=args.seed - 1)
    ple_test_env = make_ple_env(args.test_env, seed=100 + (args.seed - 1))

    if args.architecture == 'ff':
        policy_fn = LargerMLPPolicy
    elif args.architecture == 'lstm':
        policy_fn = LargerLSTMPolicy
    elif args.architecture == 'gru':
        policy_fn = GRUPolicy
    else:
        print('Policy option %s is not implemented yet.' % args.policy)

    output_dir = os.path.join(args.logdir, ('a2c_output' + str(args.seed)))
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    # Store hyperparameter settings.
    with open(os.path.join(output_dir, 'hyperparams.txt'), 'a') as f:
        for k, v in vars(args).items():
            f.write(k + ': ' + str(v) + '\n')

    logger = logging.getLogger()
    fh = logging.FileHandler(os.path.join(output_dir, 'algo.log'))
    fh.setLevel(logging.INFO)
    fh.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s:%(name)s: %(message)s'))
    logger.addHandler(fh)
    logger.setLevel(logging.INFO)
    logger.propagate = False

    # if not args.test_stage:
    #     # construct training model
    #     pass

    # Pass the constructed objects to the meta learner via the args namespace.
    args.env = ple_env
    args.test_env = ple_test_env
    args.logdir = output_dir
    args.units_per_hlayer = (args.units_shared_layer1, args.units_shared_layer2, args.units_policy_layer)
    args.policy = policy_fn
    args.total_timesteps = 200000
    meta_learn(**args.__dict__)
    ple_env.close()
def main():
    parser = arg_parser()
    parser.add_argument('--early_stop', help='Stop badly performing runs earlier', type=bool, default=False)
    parser.add_argument('--gamma', help='Discount factor for discounting the reward', type=float, default=0.90)
    parser.add_argument('--epsilon', help='Epsilon for epsilon-greedy policy', type=float, default=0.5)
    parser.add_argument('--epsilon_decay', help='Epsilon decay rate', type=float, default=0.995)
    parser.add_argument('--tau', help='Update rate of target network', type=float, default=0.99)
    parser.add_argument('--lr', help='Learning Rate', type=float, default=5e-4)
    parser.add_argument('--buffer_size', help='Replay buffer size', type=float, default=500)
    parser.add_argument('--batch_size',
                        help='Batch size. Number of samples drawn from buffer, which are used to update the model.',
                        type=int, default=50)
    parser.add_argument('--trace_length', help='Length of the traces obtained from the batched episodes',
                        type=int, default=1)
    parser.add_argument('--units_layer1', help='Units in first hidden layer', type=int, default=64)
    parser.add_argument('--units_layer2', help='Units in second hidden layer', type=int, default=64)
    parser.add_argument('--units_layer3', help='Units in third hidden layer', type=int, default=64)
    parser.add_argument('--activ_fcn', choices=['relu6', 'elu', 'mixed'], type=str, default='relu6',
                        help='Activation functions of network layers')
    parser.add_argument('--update_interval', type=int, default=30,
                        help='Frequency with which the network model is updated based on minibatch data.')
    args = parser.parse_args()

    assert args.buffer_size > (args.batch_size * args.trace_length), \
        'Replay buffer size needs to be larger than batch_size * trace_length!'

    seed = args.seed
    env = make_ple_env(args.env, seed=seed - 1)
    # env = make_ple_env('ContFlappyBird-hNS-nrf2-test-v0', seed=seed-1)
    test_env = make_ple_env(args.test_env, seed=100 + (seed - 1))

    if args.architecture == 'ff':
        q_network = FF_DQN
        args.trace_length = 1
    elif args.architecture == 'lstm':
        q_network = LSTM_DQN
    elif args.architecture == 'gru':
        q_network = GRU_DQN

    # logdir = os.path.join(args.logdir, str(datetime.datetime.today()))
    # os.makedirs(logdir)
    dqn_output_dir = os.path.join(args.logdir, ('dqn_output' + str(args.seed)))
    if not os.path.isdir(dqn_output_dir):
        os.makedirs(dqn_output_dir)

    # Store hyperparameter settings.
    with open(os.path.join(dqn_output_dir, 'hyperparams.txt'), 'a') as f:
        for k, v in vars(args).items():
            f.write(k + ': ' + str(v) + '\n')

    # Setting up the root logger is necessary in order to use the FileHandler.
    logger = logging.getLogger()
    fh = logging.FileHandler(os.path.join(dqn_output_dir, 'algo.log'))
    fh.setLevel(logging.INFO)
    fh.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s:%(name)s: %(message)s'))
    logger.addHandler(fh)
    logger.setLevel(logging.INFO)
    logger.propagate = False

    # If the experience replay buffer is smaller than batch_size * trace_length, not enough
    # observations are fed to the network to compute the update step and the code throws an error.
    if args.buffer_size < (args.batch_size * args.trace_length):
        logger.info('Experience replay buffer is too small. Should be bigger than '
                    'batch_size * trace_length = %i * %i' % (args.batch_size, args.trace_length))
        # return -3000, 3000, -3000

    early_stopped, _ = q_learning(q_network=q_network,
                                  env=env,
                                  test_env=test_env,
                                  seed=seed,
                                  total_timesteps=args.total_timesteps,
                                  log_interval=args.log_interval,
                                  test_interval=args.test_interval,
                                  show_interval=args.show_interval,
                                  logdir=dqn_output_dir,
                                  lr=args.lr,
                                  max_grad_norm=args.max_grad_norm,
                                  units_per_hlayer=(args.units_layer1, args.units_layer2, args.units_layer3),
                                  activ_fcn=args.activ_fcn,
                                  gamma=args.gamma,
                                  epsilon=args.epsilon,
                                  epsilon_decay=args.epsilon_decay,
                                  buffer_size=args.buffer_size,
                                  batch_size=args.batch_size,
                                  trace_length=args.trace_length,
                                  tau=args.tau,
                                  update_interval=args.update_interval,
                                  early_stop=args.early_stop,
                                  keep_model=args.keep_model)
    env.close()
    args.logdir = dqn_output_dir