def train(env_id, num_timesteps, seed):
    import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1, num_gpu=0).__enter__()
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=128, num_hid_layers=2)

    env.seed(seed)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_batch=2048,
                        clip_param=0.2,
                        entcoeff=0.0,
                        optim_epochs=10,
                        optim_stepsize=1e-4,
                        optim_batchsize=64,
                        gamma=0.99,
                        lam=0.95,
                        schedule='constant')
    env.close()
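# A minimal usage sketch for the PPO trainer above; the env id, step budget,
# and seed below are illustrative assumptions, not values taken from this file.
if __name__ == '__main__':
    train(env_id='RoboschoolHumanoid-v1', num_timesteps=int(1e6), seed=0)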
def main():
    parser = arg_parser()
    add_env_params(parser)
    parser.add_argument('--num-timesteps', type=int, default=int(1e12))
    parser.add_argument('--num_env', type=int, default=32)
    parser.add_argument('--use_news', type=int, default=0)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--gamma_ext', type=float, default=0.99)
    parser.add_argument('--lam', type=float, default=0.95)
    parser.add_argument('--update_ob_stats_every_step', type=int, default=0)
    parser.add_argument('--update_ob_stats_independently_per_gpu', type=int, default=0)
    parser.add_argument('--update_ob_stats_from_random_agent', type=int, default=1)
    parser.add_argument('--proportion_of_exp_used_for_predictor_update', type=float, default=1.)
    parser.add_argument('--tag', type=str, default='')
    parser.add_argument('--policy', type=str, default='cnn', choices=['cnn', 'rnn'])
    parser.add_argument('--int_coeff', type=float, default=1.)
    parser.add_argument('--ext_coeff', type=float, default=2.)
    parser.add_argument('--dynamics_bonus', type=int, default=0)
    args = parser.parse_args()

    logger.configure(dir=logger.get_dir(),
                     format_strs=['stdout', 'log', 'csv'] if MPI.COMM_WORLD.Get_rank() == 0 else [])
    if MPI.COMM_WORLD.Get_rank() == 0:
        with open(os.path.join(logger.get_dir(), 'experiment_tag.txt'), 'w') as f:
            f.write(args.tag)
        # shutil.copytree(os.path.dirname(os.path.abspath(__file__)), os.path.join(logger.get_dir(), 'code'))

    mpi_util.setup_mpi_gpus()
    seed = 10000 * args.seed + MPI.COMM_WORLD.Get_rank()
    set_global_seeds(seed)

    hps = dict(
        frame_stack=4,
        nminibatches=4,
        nepochs=4,
        lr=0.0001,
        max_grad_norm=0.0,
        use_news=args.use_news,
        gamma=args.gamma,
        gamma_ext=args.gamma_ext,
        max_episode_steps=args.max_episode_steps,
        lam=args.lam,
        update_ob_stats_every_step=args.update_ob_stats_every_step,
        update_ob_stats_independently_per_gpu=args.update_ob_stats_independently_per_gpu,
        update_ob_stats_from_random_agent=args.update_ob_stats_from_random_agent,
        proportion_of_exp_used_for_predictor_update=args.proportion_of_exp_used_for_predictor_update,
        policy=args.policy,
        int_coeff=args.int_coeff,
        ext_coeff=args.ext_coeff,
        dynamics_bonus=args.dynamics_bonus,
    )

    tf_util.make_session(make_default=True)
    train(env_id=args.env, num_env=args.num_env, seed=seed,
          num_timesteps=args.num_timesteps, hps=hps)
def load(path, num_cpus=1):
    with open(path, "rb") as f:
        model_data, act_params = dill.load(f)
    act = build_act(**act_params)
    sess = U.make_session(num_cpus=num_cpus)
    sess.__enter__()
    with tempfile.TemporaryDirectory() as td:
        # Unpack the pickled model bytes into a temp dir, then restore the
        # TF variables from the extracted checkpoint.
        filepath = os.path.join(td, "packed.zip")
        with open(filepath, "wb") as f:
            f.write(model_data)
        zipfile.ZipFile(filepath, 'r', zipfile.ZIP_DEFLATED).extractall(td)
        U.load_state(os.path.join(td, "model"))
    return ActWrapper(act, act_params)
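# Hedged usage sketch for `load`: the pickle path and env id are hypothetical.
# We assume the returned ActWrapper is callable on a batched observation, as
# the deepq-style act functions elsewhere in this file are.
import gym
import numpy as np

env = gym.make("CartPole-v0")
act = load("save/model.pkl")
obs = env.reset()
action = act(np.array(obs)[None])[0]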
def run():
    import mlp_policy_robo
    U.make_session(num_cpu=1).__enter__()
    env = gym.make("RoboschoolHumanoid-v1")
    # env = wrappers.Monitor(env, directory="./video/HalfCheeta-v1", force=True)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy_robo.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                         hid_size=128, num_hid_layers=2)

    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space, ac_space)
    oldpi = policy_fn("oldpi", ob_space, ac_space)
    U.load_state("save/Humanoid-v1")

    for epi in range(100):
        ob = env.reset()
        total_reward = 0
        step = 0
        while True:
            env.render("human")
            ac, v = pi.act(True, ob)
            ob, rew, new, info = env.step(ac)
            step += 1
            total_reward += rew
            if new:
                print("Reward: {}, Step: {}".format(total_reward, step))
                break
def bernstein_error_partition_cuda(nn, f, degree_bound, input_box, output_index,
                                   activation, filename):
    global step
    step += 1

    import error_bound
    eps = error_bound.error_bound

    input_dim = len(degree_bound)
    lips, network_output_range = lipschitz(nn, input_box, output_index, activation)

    # Largest side length of the input box, used to estimate how finely the
    # box must be partitioned for the target sampling tolerance eps.
    distance_estimate = 0
    for idxState in range(input_dim):
        diff = np.diff(input_box[idxState])[0]
        if diff > distance_estimate:
            distance_estimate = diff

    LD_estimate = lips * distance_estimate * np.sqrt(input_dim)
    num_partition = int(np.ceil(LD_estimate // eps + 1))
    partition = [num_partition] * input_dim

    print('---------------' + filename + '-------------------')
    print('step: {}'.format(step))
    print('degree bound: {}'.format(degree_bound))
    print('number of partition: {}'.format(num_partition))
    print('Lipschitz constant: {}'.format(lips))

    all_comb_lists = sample_points_list(partition, input_dim)

    if isinstance(lips, np.ndarray):
        lips = lips[0]

    # For very large grids, stage the sample points on disk via HDF5 instead
    # of holding them in RAM.
    sample_times = (num_partition + 1) ** input_dim
    large_sample_times = False
    if sample_times < 1e7:
        all_sample_points = np.zeros((sample_times, input_dim), dtype=np.float32)
        all_shift_points = np.zeros((sample_times, input_dim), dtype=np.float32)
    else:
        large_sample_times = True
        if os.path.exists('./cach.hdf5'):
            os.remove('./cach.hdf5')
        hdf5_store = h5py.File('./cach.hdf5', 'a')
        all_sample_points = hdf5_store.create_dataset(
            "all_sample_points", (sample_times, input_dim), compression='gzip')
        all_shift_points = hdf5_store.create_dataset(
            "all_shift_points", (sample_times, input_dim), compression='gzip')

    partition_box = np.zeros(input_dim, dtype=np.float64)
    for j in range(input_dim):
        alpha_j = np.float64(input_box[j][0])
        beta_j = np.float64(input_box[j][1])
        partition_box[j] = (beta_j - alpha_j) / num_partition

    # Map the unit grid of combinations into the actual input box.
    for idxState in range(input_dim):
        alpha_j = np.float64(input_box[idxState][0])
        beta_j = np.float64(input_box[idxState][1])
        all_sample_points[:, idxState] = (
            (beta_j - alpha_j) * (points_list(all_comb_lists, idxState) / num_partition)
            + alpha_j)

    all_shift_points = point_shift_all(all_sample_points, input_box,
                                       large_sample_times, all_shift_points)
    if large_sample_times:
        hdf5_store.close()

    order_list, coeffs_list = nn_poly_approx_bernstein_cuda(f, degree_bound,
                                                            input_box, output_index)
    poly = polyval(order_list, degree_bound, coeffs_list, 'test')

    if large_sample_times:
        with h5py.File('./cach.hdf5', 'r') as hdf5_store:
            all_sample_points = hdf5_store['all_sample_points'][:]
            all_shift_points = hdf5_store['all_shift_points'][:]

    # Evaluate the polynomial and the network in batches to bound peak memory.
    if filename[:4] == 'nn_5' or filename[:4] == 'nn_2':
        batch_size = 1e5
    else:
        batch_size = 1e7
    batch_num = math.ceil(all_sample_points.shape[0] / batch_size)
    batch_idx = np.arange(1, batch_num) * batch_size
    batch_idx = batch_idx.astype(int)
    all_sample_points_batches = np.split(all_sample_points, batch_idx, axis=0)
    all_shift_points_batches = np.split(all_shift_points, batch_idx, axis=0)

    poly_results = np.zeros((all_sample_points.shape[0], 1))
    nn_results = np.zeros((all_sample_points.shape[0], 1))
    with U.make_session() as sess:
        sess.run(tf.global_variables_initializer())
        batch_pointer = 0
        print('number of sampling points: {}'.format(all_sample_points.shape[0]))
        for sample_points, shift_points in zip(all_sample_points_batches,
                                               all_shift_points_batches):
            batch_range = range(batch_pointer, batch_pointer + sample_points.shape[0])
            print('batch_range: {}'.format(batch_range))
            poly_results[batch_range, :] = poly(sess, shift_points)
            nn_results[batch_range, :] = nn(sess, sample_points)
            batch_pointer += sample_points.shape[0]

    sample_error = np.max(np.absolute(poly_results[:, 0] - nn_results[:, 0]))
    error = sample_error + lips * LA.norm(partition_box)
    print('bp to nn error: {}'.format(error))
    return error
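# Numeric illustration of the bound returned above: the certified error is the
# worst sampled deviation plus lips * ||partition cell||. The values below are
# made up purely for illustration.
import numpy as np
from numpy import linalg as LA

lips = 2.0                            # Lipschitz constant of the network
partition_box = np.array([0.1, 0.1])  # per-dimension partition cell widths
sample_error = 0.03                   # max |poly - nn| over the sample grid
error = sample_error + lips * LA.norm(partition_box)
print(error)  # 0.03 + 2.0 * 0.1414... ~= 0.3128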
def main():
    parser = arg_parser()
    parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--max_episode_steps', type=int, default=4500)
    parser.add_argument('--num-timesteps', type=int, default=int(1e8))
    parser.add_argument('--num_env', type=int, default=128)
    parser.add_argument('--use_news', type=int, default=0)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--lam', type=float, default=0.95)
    parser.add_argument('--update_ob_stats_every_step', type=int, default=0)
    parser.add_argument('--update_ob_stats_independently_per_gpu', type=int, default=0)
    parser.add_argument('--update_ob_stats_from_random_agent', type=int, default=1)
    parser.add_argument('--proportion_of_exp_used_for_predictor_update', type=float, default=1.)
    parser.add_argument('--tag', type=str, default='')
    parser.add_argument('--policy', type=str, default='cnn', choices=['cnn', 'rnn'])
    parser.add_argument('--int_coeff', type=float, default=1.)
    parser.add_argument('--ext_coeff', type=float, default=0.)
    parser.add_argument('--beta', type=float, default=1e-3)
    parser.add_argument('--exploration_type', type=str, default='bottleneck')
    parser.add_argument('--noise_type', type=str, default='none', choices=['none', 'box'])
    parser.add_argument('--noise_p', type=float, default=0.1)
    parser.add_argument('--use_sched', type=int, default=0)
    parser.add_argument('--exp_name', type=str, default='none')
    args = parser.parse_args()

    # The RNN policy uses a longer extrinsic-reward horizon.
    if args.policy == 'rnn':
        args.gamma_ext = 0.999
    else:
        args.gamma_ext = 0.99

    logger_dir = './results/' + args.env.replace("NoFrameskip-v4", "")
    logger_dir += datetime.datetime.now().strftime("-%m-%d-%H-%M-%S")
    logger.configure(dir=logger_dir,
                     format_strs=['stdout', 'log', 'csv'] if MPI.COMM_WORLD.Get_rank() == 0 else [])
    if MPI.COMM_WORLD.Get_rank() == 0:
        with open(os.path.join(logger.get_dir(), 'experiment_tag.txt'), 'w') as f:
            f.write(args.tag)

    seed = 10000 * args.seed + MPI.COMM_WORLD.Get_rank()
    set_global_seeds(seed)

    hps = dict(
        frame_stack=4,
        nminibatches=4,
        nepochs=4,
        lr=0.0001,
        max_grad_norm=0.0,
        use_news=args.use_news,
        gamma=args.gamma,
        gamma_ext=args.gamma_ext,
        max_episode_steps=args.max_episode_steps,
        lam=args.lam,
        update_ob_stats_every_step=args.update_ob_stats_every_step,
        update_ob_stats_independently_per_gpu=args.update_ob_stats_independently_per_gpu,
        update_ob_stats_from_random_agent=args.update_ob_stats_from_random_agent,
        proportion_of_exp_used_for_predictor_update=args.proportion_of_exp_used_for_predictor_update,
        policy=args.policy,
        int_coeff=args.int_coeff,
        ext_coeff=args.ext_coeff,
        exploration_type=args.exploration_type,
        beta=args.beta,
        noise_type=args.noise_type,
        noise_p=args.noise_p,
        use_sched=args.use_sched,
        exp_name=args.exp_name,
    )

    tf_util.make_session(make_default=True)
    train(env_id=args.env, num_env=args.num_env, seed=seed,
          num_timesteps=args.num_timesteps, hps=hps)
def main(): parser = arg_parser() add_env_params(parser) parser.add_argument( "--num-timesteps", type=int, default=int(1e12), ) parser.add_argument( "--num_env", type=int, default=32, ) parser.add_argument( "--use_news", type=int, default=0, ) parser.add_argument( "--gamma", type=float, default=0.99, ) parser.add_argument( "--gamma_ext", type=float, default=0.999, ) parser.add_argument( "--lam", type=float, default=0.95, ) parser.add_argument( "--update_ob_stats_every_step", type=int, default=0, ) parser.add_argument( "--update_ob_stats_independently_per_gpu", type=int, default=0, ) parser.add_argument( "--update_ob_stats_from_random_agent", type=int, default=1, ) parser.add_argument( "--proportion_of_exp_used_for_predictor_update", type=float, default=1.0, ) parser.add_argument( "--tag", type=str, default="", ) parser.add_argument( "--policy", type=str, default="cnn", choices=["cnn", "rnn", "ffnn"], ) parser.add_argument( "--int_coeff", type=float, default=1.0, ) parser.add_argument( "--ext_coeff", type=float, default=2.0, ) parser.add_argument( "--dynamics_bonus", type=int, default=0, ) parser.add_argument( "--meta_rl", type=lambda x: True if x.lower() in {'true', 't'} else False, default=False, ) args = parser.parse_args() logger.configure( dir=logger.get_dir(), format_strs=["stdout", "log", "csv"] if MPI.COMM_WORLD.Get_rank() == 0 else [], ) if MPI.COMM_WORLD.Get_rank() == 0: with open(os.path.join(logger.get_dir(), "experiment_tag.txt"), "w") as f: f.write(args.tag) mpi_util.setup_mpi_gpus() seed = 10000 * args.seed + MPI.COMM_WORLD.Get_rank() set_global_seeds(seed) hps = dict( frame_stack=4, nminibatches=4, nepochs=4, lr=0.0001, max_grad_norm=0.0, use_news=args.use_news, gamma=args.gamma, gamma_ext=args.gamma_ext, max_episode_steps=args.max_episode_steps, lam=args.lam, update_ob_stats_every_step=args.update_ob_stats_every_step, update_ob_stats_independently_per_gpu=args. update_ob_stats_independently_per_gpu, update_ob_stats_from_random_agent=args. update_ob_stats_from_random_agent, proportion_of_exp_used_for_predictor_update=args. proportion_of_exp_used_for_predictor_update, policy=args.policy, int_coeff=args.int_coeff, ext_coeff=args.ext_coeff, dynamics_bonus=args.dynamics_bonus, meta_rl=args.meta_rl, ) tf_util.make_session(make_default=True) train( env_id=args.env, num_env=args.num_env, seed=seed, num_timesteps=args.num_timesteps, hps=hps, )
def main():
    parser = arg_parser()
    add_env_params(parser)
    parser.add_argument('--num_timesteps', type=float, default=100e6)
    parser.add_argument('--num_env', type=int, default=128)
    parser.add_argument('--use_news', type=int, default=0)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--gamma_ext', type=float, default=0.99)
    parser.add_argument('--gamma_div', type=float, default=0.999)
    parser.add_argument('--lam', type=float, default=0.95)
    parser.add_argument('--update_ob_stats_every_step', type=int, default=0)
    parser.add_argument('--update_ob_stats_independently_per_gpu', type=int, default=1)
    parser.add_argument('--update_ob_stats_from_random_agent', type=int, default=1)
    parser.add_argument('--proportion_of_exp_used_for_predictor_update', type=float, default=1.)
    parser.add_argument('--tag', type=str, default='')
    parser.add_argument('--policy', type=str, default='cnn', choices=['cnn', 'rnn'])
    parser.add_argument('--int_coeff', type=float, default=1.)
    parser.add_argument('--ext_coeff', type=float, default=2.)
    parser.add_argument('--dynamics_bonus', type=int, default=0)
    parser.add_argument('--save_dir', help="dir to save and log", type=str, default="save_dir")
    parser.add_argument('--load_path', help="dir to load model", type=str, default=None)
    parser.add_argument('--base_load_path', help="dir to load model", type=str, default=None)
    parser.add_argument('--r_path', help="dir to load r network", type=str, default=None)
    parser.add_argument('--play', default=False, action='store_true')
    parser.add_argument('--only_train_r', default=False, action='store_true')
    parser.add_argument('--online_train_r', default=False, action='store_true')
    # parser.add_argument('--ec_type', type=str, default='episodic_curiosity', choices=['episodic_curiosity', 'none', 'oracle'])
    parser.add_argument('--rnd_type', type=str, default='rnd', choices=['rnd', 'oracle'])
    parser.add_argument('--reset', default=False, action='store_true')
    parser.add_argument('--dynamics_sample', default=False, action='store_true')
    parser.add_argument('--num_agents', type=int, default=1)
    parser.add_argument('--div_type', type=str, default='oracle', choices=['oracle', 'cls', 'rnd'])
    parser.add_argument('--load_ram', default=False, action='store_true')
    parser.add_argument('--debug', default=False, action='store_true')
    parser.add_argument('--rnd_mask_prob', type=float, default=1.)
    parser.add_argument('--rnd_mask_type', type=str, default='indep', choices=['prog', 'indep', 'shared'])
    parser.add_argument('--indep_rnd', default=False, action='store_true')
    parser.add_argument('--indep_policy', default=True, action='store_true')
    parser.add_argument('--sd_type', type=str, default='oracle', choices=['oracle', 'sd'])
    parser.add_argument('--from_scratch', default=False, action='store_true')
    parser.add_argument('--kl', default=False, action='store_true')
    args = parser.parse_args()

    log_path = os.path.join(args.save_dir, 'logs')
    save_path = os.path.join(args.save_dir, 'models')
    logger.configure(dir=log_path,
                     format_strs=['stdout', 'log', 'csv'] if MPI.COMM_WORLD.Get_rank() == 0 else [])
    if MPI.COMM_WORLD.Get_rank() == 0:
        with open(os.path.join(logger.get_dir(), 'experiment_tag.txt'), 'w') as f:
            f.write(args.tag)
        # shutil.copytree(os.path.dirname(os.path.abspath(__file__)), os.path.join(logger.get_dir(), 'code'))

    mpi_util.setup_mpi_gpus()
    seed = 10000 * args.seed + MPI.COMM_WORLD.Get_rank()
    set_global_seeds(seed)

    hps = dict(
        frame_stack=4,
        nminibatches=4,
        nepochs=4,
        lr=0.0001,
        max_grad_norm=0.0,
        use_news=args.use_news,
        gamma=args.gamma,
        gamma_ext=args.gamma_ext,
        gamma_div=args.gamma_div,
        max_episode_steps=args.max_episode_steps,
        lam=args.lam,
        update_ob_stats_every_step=args.update_ob_stats_every_step,
        update_ob_stats_independently_per_gpu=args.update_ob_stats_independently_per_gpu,
        update_ob_stats_from_random_agent=args.update_ob_stats_from_random_agent,
        proportion_of_exp_used_for_predictor_update=args.proportion_of_exp_used_for_predictor_update,
        policy=args.policy,
        int_coeff=args.int_coeff,
        ext_coeff=args.ext_coeff,
        dynamics_bonus=args.dynamics_bonus,
        log_interval=10,
        save_path=save_path,
        load_path=args.load_path,
        r_path=args.r_path,
        play=args.play,
        only_train_r=args.only_train_r,
        online_train_r=args.online_train_r,
        # ec_type=args.ec_type,
        rnd_type=args.rnd_type,
        reset=args.reset,
        dynamics_sample=args.dynamics_sample,
        num_agents=args.num_agents,
        div_type=args.div_type,
        load_ram=args.load_ram,
        debug=args.debug,
        rnd_mask_prob=args.rnd_mask_prob,
        rnd_mask_type=args.rnd_mask_type,
        indep_rnd=args.indep_rnd,
        indep_policy=args.indep_policy,
        sd_type=args.sd_type,
        from_scratch=args.from_scratch,
        base_load_path=args.base_load_path,
        use_kl=args.kl,
    )

    if args.play:
        args.num_env = 1

    tf_util.make_session(make_default=True)
    train(env_id=args.env, num_env=args.num_env, seed=seed,
          num_timesteps=args.num_timesteps, hps=hps)
def main():
    default_log_dir = "/tmp/rnd_log"
    parser = arg_parser()
    add_env_params(parser)
    parser.add_argument('--num-timesteps', type=int, default=int(4.2e7))  # 10k
    parser.add_argument('--num_env', type=int, default=32)
    parser.add_argument('--use_news', type=int, default=0)
    parser.add_argument('--gamma', type=float, default=0.99)
    # parser.add_argument('--gamma_ext', type=float, default=0.99)
    parser.add_argument('--gamma_ext', type=float, default=0.999)
    parser.add_argument('--lam', type=float, default=0.95)
    parser.add_argument('--update_ob_stats_every_step', type=int, default=0)
    parser.add_argument('--update_ob_stats_independently_per_gpu', type=int, default=0)
    parser.add_argument('--update_ob_stats_from_random_agent', type=int, default=1)
    # parser.add_argument('--proportion_of_exp_used_for_predictor_update', type=float, default=1.)
    parser.add_argument('--proportion_of_exp_used_for_predictor_update', type=float, default=0.25)
    parser.add_argument('--tag', type=str, default='')
    parser.add_argument('--policy', type=str, default='cnn', choices=['cnn', 'rnn'])
    parser.add_argument('--int_coeff', type=float, default=1.)
    parser.add_argument('--ext_coeff', type=float, default=2.)
    parser.add_argument('--dynamics_bonus', type=int, default=0)
    parser.add_argument('--logdir', type=str, default=default_log_dir)
    parser.add_argument('--action_balance_coef', '--abc', type=float, default=None)
    parser.add_argument('--array_action', type=int, default=1)
    parser.add_argument('--num_minibatches', type=int, default=4)
    args = parser.parse_args()

    # Refuse to overwrite an existing, non-empty custom log directory.
    if args.logdir != default_log_dir and os.path.isdir(args.logdir) and os.listdir(args.logdir):
        raise ValueError("logdir not empty!")

    logger.configure(dir=args.logdir,
                     format_strs=['stdout', 'log', 'csv', 'tensorboard']
                     if MPI.COMM_WORLD.Get_rank() == 0 else [])
    if MPI.COMM_WORLD.Get_rank() == 0:
        with open(os.path.join(logger.get_dir(), 'experiment_tag.txt'), 'w') as f:
            f.write(args.tag)
        # shutil.copytree(os.path.dirname(os.path.abspath(__file__)), os.path.join(logger.get_dir(), 'code'))

    mpi_util.setup_mpi_gpus()
    seed = 10000 * args.seed + MPI.COMM_WORLD.Get_rank()
    set_global_seeds(seed)

    hps = dict(
        frame_stack=4,
        nminibatches=args.num_minibatches,
        nepochs=4,
        lr=0.0001,
        max_grad_norm=0.0,
        use_news=args.use_news,
        gamma=args.gamma,
        gamma_ext=args.gamma_ext,
        max_episode_steps=args.max_episode_steps,
        lam=args.lam,
        update_ob_stats_every_step=args.update_ob_stats_every_step,
        update_ob_stats_independently_per_gpu=args.update_ob_stats_independently_per_gpu,
        update_ob_stats_from_random_agent=args.update_ob_stats_from_random_agent,
        proportion_of_exp_used_for_predictor_update=args.proportion_of_exp_used_for_predictor_update,
        policy=args.policy,
        int_coeff=args.int_coeff,
        ext_coeff=args.ext_coeff,
        dynamics_bonus=args.dynamics_bonus,
        action_balance_coef=args.action_balance_coef,
        array_action=args.array_action,
    )
    logger.info('args: {}'.format(args))

    tf_util.make_session(make_default=True)
    train(env_id=args.env, num_env=args.num_env, seed=seed,
          num_timesteps=args.num_timesteps, hps=hps)
def main():
    parser = arg_parser()
    add_env_params(parser)
    parser.add_argument('--num-timesteps', type=int, default=int(1e12))
    parser.add_argument('--num_env', type=int, default=32)
    parser.add_argument('--use_news', type=int, default=0)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--gamma_ext', type=float, default=0.99)
    parser.add_argument('--lam', type=float, default=0.95)
    parser.add_argument('--update_ob_stats_every_step', type=int, default=0)
    parser.add_argument('--update_ob_stats_independently_per_gpu', type=int, default=0)
    parser.add_argument('--update_ob_stats_from_random_agent', type=int, default=1)
    parser.add_argument('--proportion_of_exp_used_for_predictor_update', type=float, default=1.)
    parser.add_argument('--tag', type=str, default='')
    parser.add_argument('--policy', type=str, default='rnn', choices=['cnn', 'rnn'])
    parser.add_argument('--int_coeff', type=float, default=1.)
    parser.add_argument('--ext_coeff', type=float, default=2.)
    parser.add_argument('--dynamics_bonus', type=int, default=0)
    parser.add_argument('--save_dir', type=str,
                        default='/home/hxu/PriorRL/random-network-distillation/ckpts/')
    parser.add_argument('--load_dir', type=str,
                        default='/home/hxu/PriorRL/random-network-distillation/ckpts/')
    parser.add_argument('--test', type=int, default=0)
    parser.add_argument('--save_image', type=int, default=0)
    parser.add_argument('--exp_name', type=str, default='tmp')
    parser.add_argument('--logdir', type=str, default='./logs/')
    parser.add_argument('--clip_rewards', type=int, default=1)
    parser.add_argument('--e_greedy', type=int, default=0)
    parser.add_argument('--action_space', type=str, default='RIGHT_ONLY')
    parser.add_argument('--load_mtype', type=str, default='latest')
    args = parser.parse_args()

    logdir = os.path.join(
        args.logdir,
        args.exp_name + '_' + datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S"))
    logger.configure(folder=logdir,
                     format_strs=['stdout', 'log', 'csv'] if MPI.COMM_WORLD.Get_rank() == 0 else [])
    if MPI.COMM_WORLD.Get_rank() == 0:
        with open(os.path.join(logger.get_dir(), 'experiment_tag.txt'), 'w') as f:
            f.write(args.tag)
        # shutil.copytree(os.path.dirname(os.path.abspath(__file__)), os.path.join(logger.get_dir(), 'code'))

    mpi_util.setup_mpi_gpus()
    seed = 10000 * args.seed + MPI.COMM_WORLD.Get_rank()
    set_global_seeds(seed)

    hps = dict(
        frame_stack=4,
        nminibatches=4,
        nepochs=4,
        lr=0.0001,
        max_grad_norm=0.0,
        use_news=args.use_news,
        gamma=args.gamma,
        gamma_ext=args.gamma_ext,
        max_episode_steps=args.max_episode_steps,
        lam=args.lam,
        update_ob_stats_every_step=args.update_ob_stats_every_step,
        update_ob_stats_independently_per_gpu=args.update_ob_stats_independently_per_gpu,
        update_ob_stats_from_random_agent=args.update_ob_stats_from_random_agent,
        proportion_of_exp_used_for_predictor_update=args.proportion_of_exp_used_for_predictor_update,
        policy=args.policy,
        int_coeff=args.int_coeff,
        ext_coeff=args.ext_coeff,
        dynamics_bonus=args.dynamics_bonus,
    )

    tf_util.make_session(make_default=True)
    train(env_id=args.env, num_env=args.num_env, seed=seed,
          num_timesteps=args.num_timesteps, hps=hps,
          load_dir=args.load_dir, save_dir=args.save_dir, test=args.test,
          exp_name=args.exp_name, clip_rewards=args.clip_rewards,
          save_image=args.save_image, action_space=args.action_space,
          e_greedy=args.e_greedy, load_mtype=args.load_mtype)
def learn(env,
          q_func,
          alpha=1e-5,
          num_cpu=1,
          n_steps=100000,
          update_target_every=500,
          train_main_every=1,
          print_every=50,
          checkpoint_every=10000,
          buffer_size=50000,
          gamma=1.0,
          batch_size=32,
          param_noise=False,
          pre_run_steps=1000,
          exploration_fraction=0.1,
          final_epsilon=0.1,
          callback=None):
    """
    :param env: gym.Env, environment from OpenAI
    :param q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the q function takes the following inputs:
            input_ph: tf.placeholder, network input
            n_actions: int, number of possible actions
            scope: str, specifying the variable scope
            reuse: bool, whether to reuse the variables given in `scope`
    :param alpha: float, learning rate
    :param num_cpu: int, number of cpus to use
    :param n_steps: int, number of training steps
    :param update_target_every: int, frequency to update the target network
    :param train_main_every: int, frequency to update (train) the main network
    :param print_every: int, how often to print messages to the console
    :param checkpoint_every: int, how often to save the model
    :param buffer_size: int, size of the replay buffer
    :param gamma: float, discount factor
    :param batch_size: int, size of the input batch
    :param param_noise: bool, whether to use parameter noise
    :param pre_run_steps: int, number of pre-run steps used to fill the replay
        buffer. Only after `pre_run_steps` steps do the main and target
        networks begin to update.
    :param exploration_fraction: float, between 0 and 1. Fraction of `n_steps`
        over which epsilon is linearly decreased; after that, epsilon remains
        unchanged.
    :param final_epsilon: float, final epsilon value, usually a small number
        close to zero.
    :param callback: (dict, dict) -> bool
        a function deciding whether it is time to stop training, taking:
            local_vars: dict, the local variables in the current scope
            global_vars: dict, the global variables in the current scope
    :return: ActWrapper, a callable act function
    """
    n_actions = env.action_space.n
    sess = U.make_session(num_cpu)
    sess.__enter__()

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.shape, name=name)

    act, train, update_target, debug = build_train(
        make_obs_ph,
        q_func,
        n_actions,
        optimizer=tf.train.AdamOptimizer(alpha),
        gamma=gamma,
        param_noise=param_noise,
        grad_norm_clipping=10)

    act_params = {
        "q_func": q_func,
        "n_actions": env.action_space.n,
        "make_obs_ph": make_obs_ph,
    }

    buffer = ReplayBuffer(buffer_size)
    exploration = LinearSchedule(schedule_steps=int(exploration_fraction * n_steps),
                                 final_p=final_epsilon,
                                 initial_p=1.0)

    # writer = tf.summary.FileWriter("./log", sess.graph)
    U.initialize()
    # writer.close()
    update_target()  # copy the main network's weights into the target network

    episode_rewards = []
    current_episode_reward = 0.0
    model_saved = False
    saved_mean_reward = 0.0
    obs_t = env.reset()

    with tempfile.TemporaryDirectory() as td:
        model_file_path = os.path.join(td, "model")
        for step in range(n_steps):
            if callback is not None:
                if callback(locals(), globals()):
                    break

            kwargs = {}
            if not param_noise:
                epsilon = exploration.value(step)
            else:
                assert False, "Not implemented"

            action = act(np.array(obs_t)[None], epsilon=epsilon, **kwargs)[0]
            obs_tp1, reward, done, _ = env.step(action)
            current_episode_reward += reward
            buffer.add(obs_t, action, reward, obs_tp1, done)
            obs_t = obs_tp1

            if done:
                obs_t = env.reset()
                episode_rewards.append(current_episode_reward)
                current_episode_reward = 0.0

            # give the buffer some time to fill up before training
            if step < pre_run_steps:
                continue

            # q_value = debug["q_values"]
            # if step % 1000 == 0:
            #     print(q_value(np.array(obs_t)[None]))

            if step % train_main_every == 0:
                obs_ts, actions, rewards, obs_tp1s, dones = buffer.sample(batch_size)
                weights = np.ones_like(dones)
                td_error = train(obs_ts, actions, rewards, obs_tp1s, dones, weights)

            if step % update_target_every == 0:
                update_target()

            # running mean over the last 100 finished episodes (0.0 until at
            # least two episodes have finished)
            mean_100eps_reward = (float(np.mean(episode_rewards[-101:-1]))
                                  if len(episode_rewards) > 1 else 0.0)

            if done and print_every is not None and len(episode_rewards) % print_every == 0:
                print("step %d, episode %d, epsilon %.2f, running mean reward %.2f"
                      % (step, len(episode_rewards), epsilon, mean_100eps_reward))

            if checkpoint_every is not None and step % checkpoint_every == 0:
                if saved_mean_reward is None or mean_100eps_reward > saved_mean_reward:
                    U.save_state(model_file_path)
                    model_saved = True
                    if print_every is not None:
                        print("Dump model to file due to mean reward increase: %.2f -> %.2f"
                              % (saved_mean_reward, mean_100eps_reward))
                    saved_mean_reward = mean_100eps_reward

        if model_saved:
            U.load_state(model_file_path)
            if print_every:
                print("Restore model from file with mean reward %.2f" % (saved_mean_reward,))

    return ActWrapper(act, act_params)
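# A minimal usage sketch for the `learn` loop above. It assumes a q-function
# builder with the (input, n_actions, scope, reuse) signature described in the
# docstring; `baselines.deepq.models.mlp` is one such builder, and the env id
# and hyperparameters here are illustrative.
import gym
from baselines.deepq import models

env = gym.make("CartPole-v0")
act = learn(env, q_func=models.mlp([64]), n_steps=50000, gamma=0.99,
            print_every=10)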
def bernstein_error_partition_cuda(f_details, f, d, box, output_index,
                                   activation, filename, eps=1e-2):
    # Per-benchmark sampling tolerances; networks not listed keep the default eps.
    eps_table = {
        'nn_12_relu': 1e-2, 'nn_12_sigmoid': 1e-2, 'nn_12_tanh': 1e-2,
        'nn_12_relu_tanh': 1e-3,
        'nn_13_relu': 1e-3, 'nn_13_sigmoid': 5e-4, 'nn_13_tanh': 1e-2,
        'nn_13_relu_tanh': 1e-2, 'nn_13_relu_tanh_1': 1e-2,
        'nn_13_relu_tanh_100': 1e-2, 'nn_13_relu_tanh_origin': 1e-2,
        'nn_14_relu': 1e-2, 'nn_14_sigmoid': 5e-3, 'nn_14_tanh': 1e-2,
        'nn_14_relu_sigmoid': 5e-3,
        'nn_tora_relu_retrained': 1e-2, 'nn_tora_tanh': 2e-2,
        'nn_tora_relu_tanh': 1e-2, 'nn_tora_sigmoid': 1e-2,
        'nn_16_relu': 5e-3, 'nn_16_sigmoid': 1e-2, 'nn_16_tanh': 1e-2,
        'nn_16_relu_tanh': 1e-2,
        'nn_18_relu': 4e-3, 'nn_18_relu_tanh': 4e-3, 'nn_18_sigmoid': 4e-3,
        'nn_18_tanh_new': 4e-3,
    }
    eps = eps_table.get(filename, eps)

    m = len(d)
    lips, network_output_range = lipschitz(f_details, box, output_index, activation)

    distance_estimate = 0
    for j in range(m):
        diff = np.diff(box[j])[0]
        if diff > distance_estimate:
            distance_estimate = diff

    LD_estimate = lips * distance_estimate * np.sqrt(m)
    num_partition = int(np.ceil(LD_estimate // eps + 1))
    partition = [num_partition] * m
    all_comb_lists = degree_comb_lists(partition, m)

    if isinstance(lips, np.ndarray):
        lips = lips[0]

    all_sample_points = np.zeros((len(all_comb_lists), m), dtype=np.float64)
    all_shift_points = np.zeros((len(all_comb_lists), m), dtype=np.float64)

    partition_box = np.zeros(m, dtype=np.float64)
    for j in range(m):
        alpha_j = np.float64(box[j][0])
        beta_j = np.float64(box[j][1])
        partition_box[j] = (beta_j - alpha_j) / num_partition

    # Map the unit grid of combinations into the actual input box.
    all_comb_lists = np.array(all_comb_lists)
    for idxState in range(m):
        alpha_j = np.float64(box[idxState][0])
        beta_j = np.float64(box[idxState][1])
        all_sample_points[:, idxState] = (
            (beta_j - alpha_j) * (all_comb_lists[:, idxState] / num_partition) + alpha_j)

    all_shift_points = point_shift_all(all_sample_points, box)

    degree_list, coef_list = nn_poly_approx_bernstein_cuda(f, d, box, output_index)
    poly = polyval(degree_list, d, coef_list, 'test')
    with U.make_session() as sess:
        sess.run(tf.global_variables_initializer())
        poly_results = poly(sess, all_shift_points)
        nn_results = f_details(sess, all_sample_points)
        # nn_results = np.zeros(len(all_sample_points), dtype=np.float64)
        # for index in range(all_sample_points.shape[0]):
        #     point = all_sample_points[index, :]
        #     nn_results[index] = f(point)[output_index]

    sample_error = np.max(np.absolute(poly_results[:, 0] - nn_results[:, 0]))
    # max_index = np.argmax(np.absolute(poly_results - nn_results))
    # print(max_index)
    # print(all_sample_points[max_index, :])
    # print(nn_results[max_index])
    # print(all_shift_points[max_index, :])
    # print(poly_results[max_index])
    error = sample_error + lips * LA.norm(partition_box)
    return error
def learn(env,
          q_func,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=1,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          num_cpu=16,
          callback=None):
    """Train a deepq model.

    Parameters
    ----------
    env : gym.Env
        environment to train on
    q_func : (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values
        of every action.
    lr : float
        learning rate for adam optimizer
    max_timesteps : int
        number of env steps to optimize for
    buffer_size : int
        size of the replay buffer
    exploration_fraction : float
        fraction of entire training period over which the exploration rate
        is annealed
    exploration_final_eps : float
        final value of random action probability
    train_freq : int
        update the model every `train_freq` steps
    batch_size : int
        size of a batch sampled from the replay buffer for training
    print_freq : int
        how often to print out training progress; set to None to disable
        printing
    checkpoint_freq : int
        how often to save the model. This is so that the best version is
        restored at the end of training. If you do not wish to restore the
        best version at the end of training, set this variable to None.
    learning_starts : int
        how many steps of the model to collect transitions for before
        learning starts
    gamma : float
        discount factor
    target_network_update_freq : int
        update the target network every `target_network_update_freq` steps
    prioritized_replay : bool
        if True, a prioritized replay buffer will be used
    prioritized_replay_alpha : float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0 : float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters : int
        number of iterations over which beta will be annealed from its
        initial value to 1.0. If set to None, equals max_timesteps.
    prioritized_replay_eps : float
        epsilon to add to the TD errors when updating priorities
    num_cpu : int
        number of cpus to use for training
    callback : (locals, globals) -> bool
        function called at every step with the state of the algorithm.
        If the callback returns true, training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act
        function.
    """
    # Create all the functions necessary to train the model
    sess = U.make_session(num_cpu=num_cpu)
    sess.__enter__()

    def make_obs_ph(name):
        return U.BatchInput([84, 84], name=name)

    act, train, update_target, debug = build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=2,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10)

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': 2,
    }

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    # This environment is assumed to auto-reset (note the commented-out
    # env.reset() below), so take an initial no-op step and keep only the
    # observation from the returned transition tuple.
    obs, _, _, _ = env.step(0)

    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join(td, "model")
        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break

            # Take action and update exploration to the newest value
            action = act(np.array(obs)[None], update_eps=exploration.value(t))[0]
            new_obs, rew, done, _ = env.step(action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                # obs = env.reset()
                episode_rewards.append(0)

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled
                # from the replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                # Apply the importance-sampling weights (uniform ones when
                # prioritized replay is disabled).
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes, new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
                # logger.record_tabular("steps", t)
                # logger.record_tabular("episodes", num_episodes)
                # logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                # logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                # logger.dump_tabular()
                print("steps: {}".format(t))
                print("episodes: {}".format(num_episodes))
                print("mean 100 episode reward: {}".format(mean_100ep_reward))
                print("% time spent exploring: {}".format(int(100 * exploration.value(t))))

            if (checkpoint_freq is not None and t > learning_starts
                    and num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    # if print_freq is not None:
                    #     logger.log("Saving model due to mean reward increase: {} -> {}".format(
                    #         saved_mean_reward, mean_100ep_reward))
                    U.save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward

        if model_saved:
            # if print_freq is not None:
            #     logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
            U.load_state(model_file)

    return ActWrapper(act, act_params)
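# Hedged usage sketch for this deepq-style `learn`: the observation placeholder
# is hard-coded to 84x84 and num_actions to 2 above, so the environment must
# serve preprocessed 84x84 observations with two actions. `make_game_env` is a
# hypothetical constructor for such an env; the conv q-network builder is the
# classic baselines.deepq.models.cnn_to_mlp.
from baselines.deepq import models

env = make_game_env()  # hypothetical: must yield 84x84 observations, 2 actions
q_func = models.cnn_to_mlp(convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
                           hiddens=[256])
act = learn(env, q_func=q_func, lr=1e-4, max_timesteps=200000,
            prioritized_replay=True)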
def main():
    parser = arg_parser()
    add_env_params(parser)
    parser.add_argument('--num-timesteps', type=int, default=int(1e7))
    parser.add_argument('--num_env', type=int, default=16)
    parser.add_argument('--use_news', type=int, default=0)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--gamma_ext', type=float, default=0.99)
    parser.add_argument('--lam', type=float, default=0.95)
    parser.add_argument('--update_ob_stats_every_step', type=int, default=0)
    parser.add_argument('--update_ob_stats_independently_per_gpu', type=int, default=0)
    parser.add_argument('--update_ob_stats_from_random_agent', type=int, default=1)
    parser.add_argument('--proportion_of_exp_used_for_predictor_update', type=float, default=1.)
    parser.add_argument('--tag', type=str, default='')
    parser.add_argument('--policy', type=str, default='rnn', choices=['cnn', 'rnn'])
    parser.add_argument('--int_coeff', type=float, default=1.)
    parser.add_argument('--ext_coeff', type=float, default=0)
    parser.add_argument('--dynamics_bonus', type=int, default=0)
    parser.add_argument('--clear-run', action='store_true', default=False,
                        help='clear the save folder before running')
    parser.add_argument('--mega-wrapper', type=int, default=0,
                        help='use the same wrapper as mega')
    args = parser.parse_args()

    args.save_dir = '../rnd_results/'
    args.save_dir = os.path.join(args.save_dir, 'e_n-{}/'.format(args.env))
    args.save_dir = os.path.join(args.save_dir, 'mega_wrapper-{}'.format(str(args.mega_wrapper)))
    args.save_dir = os.path.join(args.save_dir, 'num_env-{}'.format(str(args.num_env)))
    args.save_dir = os.path.join(args.save_dir, 'int_coeff-{}'.format(str(args.int_coeff)))

    if args.clear_run:
        # if clear_run is set, remove the save path before recreating it
        input('You have set clear_run, is that what you want?')
        subprocess.call(["rm", "-r", args.save_dir])

    try:
        os.makedirs(args.save_dir)
    except Exception:
        print('file exists')
    try:
        os.makedirs('../rnd_log_results/' + args.env + '/')
    except Exception:
        print('log file exists')

    args.summary_writer = tf.summary.FileWriter(args.save_dir)

    logger.configure(dir='../rnd_log_results/' + args.env + '/',
                     format_strs=['stdout', 'log', 'csv'] if MPI.COMM_WORLD.Get_rank() == 0 else [])
    if MPI.COMM_WORLD.Get_rank() == 0:
        with open(os.path.join(logger.get_dir(), 'experiment_tag.txt'), 'w') as f:
            f.write(args.tag)
        # shutil.copytree(os.path.dirname(os.path.abspath(__file__)), os.path.join(logger.get_dir(), 'code'))

    mpi_util.setup_mpi_gpus()
    seed = 10000 * args.seed + MPI.COMM_WORLD.Get_rank()
    set_global_seeds(seed)

    hps = dict(
        frame_stack=4,
        nminibatches=4,
        nepochs=4,
        lr=0.0001,
        max_grad_norm=0.0,
        use_news=args.use_news,
        gamma=args.gamma,
        gamma_ext=args.gamma_ext,
        max_episode_steps=args.max_episode_steps,
        lam=args.lam,
        update_ob_stats_every_step=args.update_ob_stats_every_step,
        update_ob_stats_independently_per_gpu=args.update_ob_stats_independently_per_gpu,
        update_ob_stats_from_random_agent=args.update_ob_stats_from_random_agent,
        proportion_of_exp_used_for_predictor_update=args.proportion_of_exp_used_for_predictor_update,
        policy=args.policy,
        int_coeff=args.int_coeff,
        ext_coeff=args.ext_coeff,
        dynamics_bonus=args.dynamics_bonus,
    )

    tf_util.make_session(make_default=True)
    train(env_id=args.env, num_env=args.num_env, seed=seed,
          num_timesteps=args.num_timesteps, hps=hps, args=args)
def main():
    import neptune

    parser = argparse.ArgumentParser(argument_default=None)
    parser.add_argument('--config', action='append', help='Gin config files.')
    parser.add_argument('--debug', action='store_true', default=False)
    cmd_args, unknown = parser.parse_known_args()
    debug = cmd_args.debug
    spec_path = cmd_args.config[0]

    if not debug:
        # The spec is either a cloudpickled dict or a plain Python file that
        # defines `experiments_list`.
        try:
            with open(spec_path, 'rb') as f:
                import cloudpickle
                specification = cloudpickle.load(f)
        except pickle.UnpicklingError:
            with open(spec_path) as f:
                vars_ = {'script': os.path.basename(spec_path)}
                exec(f.read(), vars_)  # pylint: disable=exec-used
                specification = vars_['experiments_list'][0].to_dict()
                print('NOTE: Only the first experiment from the list will be run!')
        parameters = specification['parameters']
    else:
        print("debug run")
        parameters = dict(env_id="toy_mr", env_size=None)

    class MockArgs(object):
        def add(self, key, value):
            setattr(self, key, value)

    args = MockArgs()
    args.add('env', parameters["env_id"])  # 'chain_env' 'toy_mr'
    args.add('env_size', parameters["env_size"])
    args.add('seed', 0)
    args.add('max_episode_steps', 300)
    args.add('num_timesteps', int(1e12))
    args.add('num_env', 32)
    args.add('use_news', 0)
    args.add('gamma', 0.99)
    args.add('gamma_ext', 0.999)
    args.add('lam', 0.95)
    args.add('update_ob_stats_every_step', 0)
    args.add('update_ob_stats_independently_per_gpu', 0)
    args.add('update_ob_stats_from_random_agent', 1)
    args.add('proportion_of_exp_used_for_predictor_update', 1.)
    args.add('tag', '')
    args.add('policy', 'cnn')
    args.add('int_coeff', 1.)
    args.add('ext_coeff', 2.)
    args.add('dynamics_bonus', 0)

    if not debug:
        # TODO: read more from the specification
        print("running with neptune")
        neptune.init(project_qualified_name="pmtest/planning-with-learned-models")
        neptune.create_experiment(
            name=specification['name'],
            tags=specification['tags'],
            params=specification['parameters'],
            upload_stdout=False,
            upload_stderr=False,
        )
        neptune.send_metric("test", 777)
        baselines_format_strs = ['log', 'csv']
    else:
        print("running without neptune")
        baselines_format_strs = ['stdout', 'log', 'csv']

    logger.configure(dir="out", format_strs=baselines_format_strs)

    seed = 10000 * args.seed  # + MPI.COMM_WORLD.Get_rank()
    set_global_seeds(seed)

    hps = dict(
        frame_stack=4,
        nminibatches=4,
        nepochs=4,
        lr=0.0001,
        max_grad_norm=0.0,
        env_size=args.env_size,
        use_news=args.use_news,
        gamma=args.gamma,
        gamma_ext=args.gamma_ext,
        max_episode_steps=args.max_episode_steps,
        lam=args.lam,
        update_ob_stats_every_step=args.update_ob_stats_every_step,
        update_ob_stats_independently_per_gpu=args.update_ob_stats_independently_per_gpu,
        update_ob_stats_from_random_agent=args.update_ob_stats_from_random_agent,
        proportion_of_exp_used_for_predictor_update=args.proportion_of_exp_used_for_predictor_update,
        policy=args.policy,
        int_coeff=args.int_coeff,
        ext_coeff=args.ext_coeff,
        dynamics_bonus=args.dynamics_bonus,
    )

    tf_util.make_session(make_default=True)
    train(env_id=args.env, num_env=args.num_env, seed=seed,
          num_timesteps=args.num_timesteps, hps=hps, use_neptune=(not debug))