def train(env_id, num_timesteps, seed):
    from baselines.pposgd import mlp_policy
    import pposgd_simple
    sess = U.make_session(num_cpu=1)
    sess.__enter__()
    logger.session().__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)
    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)
    env = bench.Monitor(env, "monitor.json")
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_batch=2048,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear',
                        )
    env.close()
    # save model
    saver = tf.train.Saver()
    saver.save(sess, "model/model.ckpt")

def train(env_id, num_timesteps, seed, num_cpu):
    from baselines.pposgd import pposgd_simple, cnn_policy
    import baselines.common.tf_util as U
    whoami = mpi_fork(num_cpu)
    if whoami == "parent":
        return
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    logger.session().__enter__()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)
    def policy_fn(name, ob_space, ac_space):  # pylint: disable=W0613
        return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)
    env = bench.Monitor(env, osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)
    env = wrap_train(env)
    num_timesteps /= 4  # because we're wrapping the envs to do frame skip
    env.seed(workerseed)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_batch=256,
                        clip_param=0.2, entcoeff=0.01,
                        optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64,
                        gamma=0.99, lam=0.95,
                        schedule='linear'
                        )
    env.close()

def train(env_id, num_timesteps, seed):
    from baselines.pposgd import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()
    logger.session().__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)
    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)
    env = bench.Monitor(env, osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    pposgd_simple.learn(
        env, policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_batch=2048,
        clip_param=0.2, entcoeff=0.0,
        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
        gamma=0.99, lam=0.95,
    )
    env.close()

def train(env_id, num_timesteps, seed, num_cpu):
    from baselines.trpo_mpi.nosharing_cnn_policy import CnnPolicy
    from baselines.trpo_mpi import trpo_mpi
    import baselines.common.tf_util as U
    whoami = mpi_fork(num_cpu)
    if whoami == "parent":
        return
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    logger.session().__enter__()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)
    def policy_fn(name, ob_space, ac_space):  # pylint: disable=W0613
        return CnnPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space)
    env = bench.Monitor(env, osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)
    env = wrap_train(env)
    num_timesteps /= 4  # because we're wrapping the envs to do frame skip
    env.seed(workerseed)
    trpo_mpi.learn(env, policy_fn,
                   timesteps_per_batch=512, max_kl=0.001, cg_iters=10, cg_damping=1e-3,
                   max_timesteps=num_timesteps, gamma=0.98, lam=1.0,
                   vf_iters=3, vf_stepsize=1e-4, entcoeff=0.00)
    env.close()

def train(env_id, num_timesteps, seed):
    from baselines.pposgd import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()
    logger.session().__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)
    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)
    env = bench.Monitor(env, osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_batch=2048,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                        gamma=0.99, lam=0.95,
                        )
    env.close()

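# Hypothetical usage sketch, not part of the snippets above: a minimal command-line
# entry point for the single-process PPO train() variants, assuming train() is defined
# in the same script. The environment id, timestep budget, and seed defaults are
# illustrative assumptions only.
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', help='environment ID', default='Hopper-v1')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--num-timesteps', type=int, default=int(1e6))
    args = parser.parse_args()
    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)


if __name__ == '__main__':
    main()
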
def train(env_id, num_timesteps, seed, num_cpu):
    from baselines.pposgd import pposgd_simple, cnn_policy
    import baselines.common.tf_util as U
    whoami = mpi_fork(num_cpu)
    if whoami == "parent":
        return
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    logger.session().__enter__()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)
    def policy_fn(name, ob_space, ac_space):  # pylint: disable=W0613
        return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)
    env = bench.Monitor(env, osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)
    env = wrap_train(env)
    num_timesteps /= 4  # because we're wrapping the envs to do frame skip
    env.seed(workerseed)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_batch=256,
                        clip_param=0.2, entcoeff=0.01,
                        optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64,
                        gamma=0.99, lam=0.95,
                        schedule='linear')
    env.close()

def enjoy(env_id, num_timesteps, seed):
    from baselines.pposgd import mlp_policy, pposgd_simple
    sess = U.make_session(num_cpu=1)
    sess.__enter__()
    logger.session().__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)
    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)
    env = bench.Monitor(env, osp.join(logger.get_dir(), "monitor.json"))
    obs = env.reset()
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    pi = policy_fn('pi', env.observation_space, env.action_space)
    tf.train.Saver().restore(sess, '/tmp/model')
    done = False
    while not done:
        action = pi.act(True, obs)[0]
        obs, reward, done, info = env.step(action)
        env.render()

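# Note on the loop above: in the baselines pposgd MlpPolicy, pi.act(stochastic, ob)
# returns an (action, value_estimate) pair, so the [0] index extracts just the action.
# The '/tmp/model' checkpoint path is hard-coded and has to match wherever the
# corresponding train() run saved its weights.
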
def train(env_id, num_timesteps, seed, num_cpu):
    whoami = mpi_fork(num_cpu)
    if whoami == "parent":
        return
    import baselines.common.tf_util as U
    logger.session().__enter__()
    sess = U.single_threaded_session()
    sess.__enter__()
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)
    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space,
                         hid_size=32, num_hid_layers=2)
    env = bench.Monitor(env, osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)
    trpo_mpi.learn(env, policy_fn,
                   timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, cg_damping=0.1,
                   max_timesteps=num_timesteps, gamma=0.99, lam=0.98,
                   vf_iters=5, vf_stepsize=1e-3)
    env.close()

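# Note, based on the baselines.common mpi_fork helper these scripts rely on:
# mpi_fork(num_cpu) relaunches the current script under mpirun with num_cpu workers
# and returns "parent" in the original process, which is why train() exits early for
# the parent. Each worker then derives its own workerseed from its MPI rank so that
# parallel rollouts are decorrelated.
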
def train(env_id, num_timesteps, seed, model_name, model_path, para, load_model,
          timesteps_per_batch, hidden_units, hidden_layers):
    # num_cpu and the TRPO hyperparameters (max_kl, cg_iters, cd_damping, gamma, lam,
    # vf_iters, vf_stepsize) are free names here and are expected to be defined at
    # module scope.
    whoami = mpi_fork(num_cpu)
    if whoami == "parent":
        return
    import baselines.common.tf_util as U
    logger.session().__enter__()
    sess = U.single_threaded_session()
    sess.__enter__()
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)
    env = SubsetWrapper(env, para)
    # env = gym_kidney.LogWrapper(env, NN, EXP, OUT, FREQ, PARAM)
    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space,
                         hid_size=hidden_units, num_hid_layers=hidden_layers)
    env = bench.Monitor(env, osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    # env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)
    trpo_indi.learn(env, policy_fn,
                    timesteps_per_batch=timesteps_per_batch,
                    max_kl=max_kl, cg_iters=cg_iters, cg_damping=cd_damping,
                    max_episodes=num_timesteps,
                    gamma=gamma, lam=lam,
                    vf_iters=vf_iters, vf_stepsize=vf_stepsize,
                    load_model=load_model, model_path=model_path)
    env.close()

def train(args):
    from baselines.pposgd import mlp_policy, pposgd_simple
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    logger.session(dir=args.exp_path,
                   format_strs=None if rank == 0 and not args.test_only and not args.evaluate else []).__enter__()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = args.seed + 10000 * rank
    set_global_seeds(workerseed)

    if args.submit:
        env = SubmitRunEnv(visualize=args.render)
    elif args.submit_round2:
        from turnips.submit_round2_env import SubmitRunEnv2
        submit_env = env = SubmitRunEnv2()
    elif args.simwalker:
        env = SimWalker(visualize=args.render)
    else:
        env = IsolatedMyRunEnv(visualize=args.render,
                               run_logs_dir=args.run_logs_dir,
                               additional_info={'exp_name': args.exp_name},
                               step_timeout=args.step_timeout,
                               n_obstacles=args.n_obstacles,
                               higher_pelvis=args.higher_pelvis)
        env = RunEnvWrapper(env, args.diff)

    if args.simwalker and args.log_simwalker:
        # workaround for double wrap problem
        cls = type("h5pyEnvLoggerClone", (gym.Wrapper,), dict(h5pyEnvLogger.__dict__))
        env = cls(env,
                  log_dir=args.run_logs_dir,
                  filename_prefix='simwalker_',
                  additional_info={
                      'exp_name': args.exp_name,
                      'difficulty': args.diff,
                      'seed': args.seed
                  })

    env = env_walker = Walker(env,
                              shaping_mode=args.shaping,
                              transform_inputs=args.transform_inputs,
                              obstacle_hack=not args.noobsthack,
                              max_steps=args.max_env_steps,
                              memory_size=args.memory_size,
                              swap_legs_mode=args.swap_legs_mode,
                              filter_obs=args.filter_obs,
                              add_time=args.add_time,
                              fall_penalty=args.fall_penalty,
                              fall_penalty_value=args.fall_penalty_val,
                              print_action=args.print_action,
                              new8_fix=args.new8_fix,
                              pause=args.pause,
                              noisy_obstacles=args.noisy_obstacles,
                              noisy_obstacles2=args.noisy_obstacles2,
                              noisy_fix=args.noisy_fix)

    if args.log_walker:
        env = h5pyEnvLogger(env,
                            log_dir=args.run_logs_dir,
                            filename_prefix='walker_',
                            additional_info={
                                'exp_name': args.exp_name,
                                'difficulty': args.diff,
                                'seed': args.seed
                            })

    if args.muscles:
        env = MuscleWalker(env)

    if args.repeats > 1:
        env = RepeatActionsWalker(env, args.repeats)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=args.hid_size,
                                    num_hid_layers=args.num_hid_layers,
                                    bound_by_sigmoid=args.bound_by_sigmoid,
                                    sigmoid_coef=args.sigmoid_coef,
                                    activation=args.activation,
                                    normalize_obs=not args.nonormalize_obs,
                                    gaussian_fixed_var=not args.nogaussian_fixed_var,
                                    avg_norm_symmetry=args.avg_norm_symmetry,
                                    symmetric_interpretation=args.symmetric_interpretation,
                                    stdclip=args.stdclip,
                                    actions=args.actions,
                                    gaussian_bias=args.gaussian_bias,
                                    gaussian_from_binary=args.gaussian_from_binary,
                                    parallel_value=args.parallel_value,
                                    pv_layers=args.pv_layers,
                                    pv_hid_size=args.pv_hid_size,
                                    three=args.three)

    if not args.test_only and not args.evaluate:
        env = bench.Monitor(env, path.join(args.exp_path, "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    current_best = float('-inf')
    current_best_completed = float('-inf')
    current_best_perc_completed = float('-inf')
    stats_f = None
    start = time.time()

    def callback(local_, global_):
        nonlocal current_best
        nonlocal current_best_completed
        nonlocal current_best_perc_completed
        nonlocal stats_f
        if rank != 0:
            return
        if args.test_only or args.evaluate:
            return
        print('ELAPSED', time.time() - start)
        print(f'{socket.gethostname()}:{args.exp_path}')
        iter_no = local_['iters_so_far']
        if iter_no % args.save_every == 0:
            U.save_state(path.join(args.exp_path, 'models', f'{iter_no:04d}', 'model'))
        if local_['iters_so_far'] == 0:
            stats_f = open(path.join(args.exp_path, 'simple_stats.csv'), 'w')
            cols = ["Iter", "EpLenMean", "EpRewMean", "EpOrigRewMean", "EpThisIter",
                    "EpisodesSoFar", "TimestepsSoFar", "TimeElapsed", "AvgCompleted",
                    "PercCompleted"]
            for name in local_['loss_names']:
                cols.append("loss_" + name)
            stats_f.write(",".join(cols) + '\n')
        else:
            current_orig_reward = np.mean(local_['origrew_buffer'])
            if current_best < current_orig_reward:
                print(f'Found better {current_best:.2f} -> {current_orig_reward:.2f}')
                current_best = current_orig_reward
                U.save_state(path.join(args.exp_path, 'best', 'model'))
            U.save_state(path.join(args.exp_path, 'last', 'model'))

            avg_completed = local_["avg_completed"]
            if current_best_completed < avg_completed:
                print(f'Found better completed {current_best_completed:.2f} -> {avg_completed:.2f}')
                current_best_completed = avg_completed
                U.save_state(path.join(args.exp_path, 'best_completed', 'model'))

            perc_completed = local_["perc_completed"]
            if current_best_perc_completed < perc_completed:
                print(f'Found better perc completed {current_best_perc_completed:.2f} -> {perc_completed:.2f}')
                current_best_perc_completed = perc_completed
                U.save_state(path.join(args.exp_path, 'perc_completed', 'model'))

            data = [
                local_['iters_so_far'],
                np.mean(local_['len_buffer']),
                np.mean(local_['rew_buffer']),
                np.mean(local_['origrew_buffer']),
                len(local_['lens']),
                local_['episodes_so_far'],
                local_['timesteps_so_far'],
                time.time() - local_['tstart'],
                avg_completed,
                perc_completed,
            ]
            if 'meanlosses' in local_:
                for lossval in local_['meanlosses']:
                    data.append(lossval)
            stats_f.write(",".join([str(x) for x in data]) + '\n')
            stats_f.flush()

    if args.load_model is not None:
        args.load_model += '/model'

    if args.submit_round2:
        submit_round2(env, submit_env, policy_fn,
                      load_model_path=args.load_model,
                      stochastic=False,
                      actions=args.actions)
        # submit_env.submit()  # submit_round2(...) submits already
        sys.exit()

    if args.evaluate:
        pposgd_simple.evaluate(env, policy_fn,
                               load_model_path=args.load_model,
                               n_episodes=args.n_eval_episodes,
                               stochastic=not args.nostochastic,
                               actions=args.actions,
                               execute_just=args.execute_just)
    else:
        pposgd_simple.learn(
            env, policy_fn,
            max_timesteps=args.max_timesteps,
            timesteps_per_batch=args.timesteps_per_batch,
            clip_param=args.clip_param,
            entcoeff=args.entcoeff,
            optim_epochs=args.optim_epochs,
            optim_stepsize=args.optim_stepsize,
            optim_batchsize=args.optim_batchsize,
            gamma=args.gamma,
            lam=args.lam,
            callback=callback,
            load_model_path=args.load_model,
            test_only=args.test_only,
            stochastic=not args.nostochastic,
            symmetric_training=args.symmetric_training,
            obs_names=env_walker.obs_names,
            single_episode=args.single_episode,
            horizon_hack=args.horizon_hack,
            running_avg_len=args.running_avg_len,
            init_three=args.init_three,
            actions=args.actions,
            symmetric_training_trick=args.symmetric_training_trick,
            bootstrap_seeds=args.bootstrap_seeds,
            seeds_fn=args.seeds_fn,
        )
    env.close()

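# Note (an assumption based on the baselines pposgd interface this fork extends):
# pposgd_simple.learn invokes the callback once per iteration as
# callback(locals(), globals()), which is why the callback above reads values such as
# local_['iters_so_far'] and local_['origrew_buffer'] straight out of the learner's
# local variables.
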
    cell = tf.nn.rnn_cell.BasicLSTMCell(64)
    rnn_out, last_state = tf.nn.dynamic_rnn(cell, out, dtype=np.float32)
    out = rnn_out[:, -1]
    # out = layers.fully_connected(out, num_outputs=16, activation_fn=tf.nn.tanh)
    out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None)
    return out


if __name__ == '__main__':
    np.random.seed(7)
    batch_size = 1  # Necessary, Different lengths
    log_session = logger.session(dir='logs')
    with U.make_session(8):
        # Create all the functions necessary to train the model
        act, train, update_target, debug = deepq.build_train(
            make_obs_ph=lambda name: tf.placeholder(tf.int32, [None, None], name=name),
            q_func=model,
            num_actions=s.action_space,
            optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
        )
        # Create the replay buffer
        replay_buffer = ReplayBuffer(50000)
        # Create the schedule for exploration starting from 1 (every action is random) down to
        # 0.02 (98% of actions are selected according to values predicted by the model).
        exploration = LinearSchedule(schedule_timesteps=10000,

def on_iteration_start(local_vars, global_vars):
    on_iteration_start.iteration += 1
    load_model(on_iteration_start.iteration)
    plot_history(local_vars['history'], on_iteration_start.iteration)
    save_model(on_iteration_start.iteration)

on_iteration_start.iteration = 0

whoami = mpi_fork(args.cores)
if whoami == 'parent':
    exit(0)

session = U.single_threaded_session()
session.__enter__()
logger.session().__enter__()

env = RunEnv(args.visualize, max_obstacles=args.obstacles, original_reward=args.original)
env.spec.timestep_limit = args.max_steps

if args.visualize:
    vis = env.osim_model.model.updVisualizer().updSimbodyVisualizer()
    vis.setBackgroundType(vis.GroundAndSky)
    vis.setShowFrameNumber(True)
    vis.zoomCameraToShowAllGeometry()
    vis.setCameraFieldOfView(1)

if args.train:
    history = pposgd_simple.learn(
        env, policy_fn,
        max_timesteps=args.steps,