def train(env_id, num_timesteps, seed):
    from baselines.ppo_pnp import mlp_policy, pposgd_simple, interactive_ppo, ppo_gail
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=3)

    env = JacoEnv(64, 64, 1, 1.0)  # make_mujoco_env(env_id, seed)
    dataset = Mujoco_Dset(expert_path='data/pnp_demo.npz', traj_limitation=-1)
    reward_giver = TransitionClassifier(env, 100, entcoeff=1e-3)
    ppo_gail.learn(env, policy_fn, reward_giver, dataset,
                   max_timesteps=num_timesteps,
                   timesteps_per_actorbatch=2048,
                   clip_param=0.2, entcoeff=0.0,
                   optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                   gamma=0.99, lam=0.95,
                   schedule='linear')
    env.close()
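# A minimal usage sketch for train() above (argument values are illustrative).
# Note that env_id and seed are currently unused inside train(): the JacoEnv
# construction is hardcoded and make_mujoco_env(env_id, seed) is commented out.
#
#     if __name__ == '__main__':
#         train(env_id='JacoPnP-v0', num_timesteps=int(1e6), seed=0)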
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    reuse=reuse, hid_size=args.policy_hidden_size,
                                    num_hid_layers=2)

    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)
    dataset = Mujoco_Dset(expert_path=args.expert_path, traj_limitation=args.traj_limitation)
    savedir_fname = learn(env, policy_fn, dataset,
                          max_iters=args.BC_max_iter,
                          ckpt_dir=args.checkpoint_dir,
                          log_dir=args.log_dir,
                          task_name=task_name,
                          verbose=True)
    avg_len, avg_ret = runner(env, policy_fn, savedir_fname,
                              timesteps_per_batch=1024,
                              number_trajs=10,
                              stochastic_policy=args.stochastic_policy,
                              save=args.save_sample,
                              reuse=True)
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    reuse=reuse, hid_size=args.policy_hidden_size,
                                    num_hid_layers=2, gaussian_fixed_var=False,
                                    obs_normalize=True)

    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    logger.configure(os.path.join("log", "BC", args.env_id,
                                  "subsample_{}".format(args.subsample_freq),
                                  "traj_{}".format(args.traj_limitation)))
    args.expert_path = 'dataset/{}.npz'.format(args.env_id).lower().replace("-v1", "")  # set expert path
    dataset = Mujoco_Dset(expert_path=args.expert_path,
                          traj_limitation=args.traj_limitation,
                          data_subsample_freq=args.subsample_freq)
    learn(env, policy_fn, dataset, max_iters=args.BC_max_iter, verbose=True)
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    reuse=reuse, hid_size=args.policy_hidden_size,
                                    num_hid_layers=2)

    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    if args.task == 'train':
        dataset = Mujoco_Dset(expert_path=args.expert_path, traj_limitation=args.traj_limitation)
        reward_giver = TransitionClassifier(env, args.adversary_hidden_size,
                                            entcoeff=args.adversary_entcoeff)
        train(env, args.seed, policy_fn, reward_giver, dataset, args.algo,
              args.g_step, args.d_step, args.policy_entcoeff, args.num_timesteps,
              args.save_per_iter, args.checkpoint_dir, args.log_dir,
              args.pretrained,  # False
              args.BC_max_iter, task_name)
    elif args.task == 'evaluate':
        runner(env, policy_fn, args.load_model_path,
               timesteps_per_batch=1024,
               number_trajs=10,
               stochastic_policy=args.stochastic_policy,
               save=args.save_sample)
    else:
        raise NotImplementedError
    env.close()
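# The main() entry points in this file all consume an argparse.Namespace. A
# minimal sketch of the parser they assume, with flag names inferred from the
# attribute accesses above (the defaults here are hypothetical):
import argparse

def build_argparser():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_id', type=str, default='Hopper-v1')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--task', type=str, choices=['train', 'evaluate'], default='train')
    parser.add_argument('--expert_path', type=str, default='data/deterministic.trpo.Hopper.0.00.npz')
    parser.add_argument('--traj_limitation', type=int, default=-1)
    parser.add_argument('--policy_hidden_size', type=int, default=100)
    parser.add_argument('--policy_entcoeff', type=float, default=0.0)
    parser.add_argument('--adversary_hidden_size', type=int, default=100)
    parser.add_argument('--adversary_entcoeff', type=float, default=1e-3)
    parser.add_argument('--algo', type=str, default='trpo')
    parser.add_argument('--g_step', type=int, default=1)
    parser.add_argument('--d_step', type=int, default=1)
    parser.add_argument('--num_timesteps', type=float, default=5e6)
    parser.add_argument('--save_per_iter', type=int, default=100)
    parser.add_argument('--checkpoint_dir', type=str, default='checkpoint')
    parser.add_argument('--log_dir', type=str, default='log')
    parser.add_argument('--pretrained', action='store_true')
    parser.add_argument('--BC_max_iter', type=int, default=10000)
    parser.add_argument('--load_model_path', type=str, default=None)
    parser.add_argument('--stochastic_policy', action='store_true')
    parser.add_argument('--save_sample', action='store_true')
    return parser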
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    # Switch from gym to robosuite; reward shaping is enabled so progress
    # toward the reach goal is visible in the reward.
    env = robosuite.make(args.env_id,
                         ignore_done=True,
                         use_camera_obs=False,
                         has_renderer=True,
                         control_freq=100,
                         gripper_visualization=True,
                         reward_shaping=True,
                         # Alternative box poses used in earlier runs:
                         #box_pos=[0.63522776, -0.3287869, 0.82162434], box_quat=[0.6775825618903728, 0, 0, 0.679425538604203],  # shift2
                         #box_pos=[0.23522776, 0.2287869, 0.82162434], box_quat=[0.3775825618903728, 0, 0, 0.679425538604203],  # shift3
                         #box_pos=[0.53522776, 0.3287869, 0.82162434], box_quat=[0.5775825618903728, 0, 0, 0.679425538604203],  # shift4
                         #box_pos=[0.53522776, 0.1287869, 0.82162434], box_quat=[0.4775825618903728, 0, 0, 0.679425538604203],  # shift5
                         #box_pos=[0.48522776, -0.187869, 0.82162434], box_quat=[0.8775825618903728, 0, 0, 0.679425538604203],  # shift6
                         box_pos=[0.43522776, -0.367869, 0.82162434],  # shift7
                         box_quat=[0.2775825618903728, 0, 0, 0.679425538604203])  # shift7
    env = GymWrapper(env)  # wrap in the gym environment
    # Environment joints should be clipped at 1 and -1 for Sawyer.

    # Task (hardcoded instead of the usual CLI flag):
    #task = 'train'
    task = 'evaluate'
    # parser.add_argument('--task', type=str, choices=['train', 'evaluate', 'sample'], default='train')

    # Expert path (100 trajectories).
    #expert_path = '/home/mastercljohnson/Robotics/GAIL_Part/mod_surreal/robosuite/models/assets/demonstrations/ac100/combined/combined_0.npz'
    expert_path = '/home/mastercljohnson/Robotics/GAIL_Part/mod_surreal/robosuite/models/assets/demonstrations/120_shift7/combined/combined_0.npz'
    #parser.add_argument('--expert_path', type=str, default='data/deterministic.trpo.Hopper.0.00.npz')

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy_sawyer.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                           reuse=reuse, hid_size=args.policy_hidden_size,
                                           num_hid_layers=2)

    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"),
                        allow_early_resets=True)
    #env.seed(args.seed)  # Sawyer does not have a seed
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)
    #if not os.path.isdir(args.log_dir):
    #    os.makedirs(args.log_dir)
    logger.log("log_directories: ", args.log_dir)
    logger.log("environment action space range: ", env.action_space)  # log the action space

    if task == 'train':
        dataset = Mujoco_Dset(expert_path=expert_path, traj_limitation=args.traj_limitation)
        # Sanity checks on the dataset dimensions (they looked correct):
        #print("dimension of inputs", dataset.dset.inputs.shape)
        #print("dimension of labels", dataset.dset.labels.shape)
        reward_giver = TransitionClassifier(env, args.adversary_hidden_size,
                                            entcoeff=args.adversary_entcoeff)
        train(env, args.seed, policy_fn, reward_giver, dataset, args.algo,
              args.g_step, args.d_step, args.policy_entcoeff, args.num_timesteps,
              args.save_per_iter, args.checkpoint_dir, args.log_dir,
              args.pretrained, args.BC_max_iter, task_name)
    elif task == 'evaluate':
        # Create the playback environment (same shift7 box pose as above;
        # the commented shift2..shift6 alternatives match the training env).
        play_env = robosuite.make(args.env_id,
                                  ignore_done=True,
                                  use_camera_obs=False,
                                  has_renderer=True,
                                  control_freq=100,
                                  gripper_visualization=True,
                                  box_pos=[0.43522776, -0.367869, 0.82162434],  # shift7
                                  box_quat=[0.2775825618903728, 0, 0, 0.679425538604203])  # shift7
        #play_env.viewer.set_camera(camera_id=2)  # Switch views for eval
        runner(env, play_env, policy_fn, args.load_model_path,
               timesteps_per_batch=4000,  # longer batches are more reasonable here
               number_trajs=20,  # number of evaluation rollouts
               stochastic_policy=args.stochastic_policy,
               save=args.save_sample)
    else:
        raise NotImplementedError
    env.close()
def learn(*,
          network,
          env,
          eval_env,
          timesteps_per_batch=1000,  # what to train on
          max_kl=0.001,
          cg_iters=10,
          gamma=0.99,
          lam=1.0,  # advantage estimation
          seed=None,
          ent_coef=0.0,
          cg_damping=1e-2,
          vf_stepsize=3e-4,
          vf_iters=3,
          num_epochs=1000,
          callback=None,
          load_path=None,
          log_dir=None,
          env_id=None,
          evaluation_freq=10,
          pretrain=False,
          expert_path=None,
          BC_max_iter=1e4,
          **network_kwargs):
    '''
    Learn a policy function with the TRPO algorithm.

    Parameters:
    ----------
    network                 neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types)
                            or function that takes input placeholder and returns tuple
                            (output, None) for feedforward nets or
                            (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets
    env                     environment (one of the gym environments or wrapped via a
                            baselines.common.vec_env.VecEnv-type class)
    eval_env                separate environment used for periodic policy evaluation
    timesteps_per_batch     timesteps per gradient estimation batch
    max_kl                  max KL divergence between old policy and new policy ( KL(pi_old || pi) )
    ent_coef                coefficient of policy entropy term in the optimization objective
    cg_iters                number of iterations of conjugate gradient algorithm
    cg_damping              conjugate gradient damping
    vf_stepsize             learning rate for adam optimizer used to optimize value function loss
    vf_iters                number of iterations of value function optimization per policy optimization step
    num_epochs              number of policy optimization epochs
    evaluation_freq         evaluate the policy every this many epochs
    callback                function to be called with (locals(), globals()) each policy optimization step
    load_path               str, path to load the model from (default: None, i.e. no model is loaded)
    **network_kwargs        keyword arguments to the policy / network builder. See
                            baselines.common/policies.py/build_policy and arguments to a particular type of network

    Returns:
    -------
    learnt model
    '''
    # Configure logging.
    log_dir = os.path.join("log", "trpo", env_id, "pretrained_" + str(pretrain), str(seed))
    logger.configure(dir=log_dir)

    if MPI is not None:
        nworkers = MPI.COMM_WORLD.Get_size()
        rank = MPI.COMM_WORLD.Get_rank()
    else:
        nworkers = 1
        rank = 0

    cpus_per_worker = 1
    U.get_session(config=tf.ConfigProto(
        allow_soft_placement=True,
        inter_op_parallelism_threads=cpus_per_worker,
        intra_op_parallelism_threads=cpus_per_worker,
    ))

    policy = build_policy(env, network, value_network='copy', **network_kwargs)
    set_global_seeds(seed)

    # Pretrain.
    mujo_dataset = Mujoco_Dset(expert_path=expert_path)
    np.set_printoptions(precision=3)

    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    ob = observation_placeholder(ob_space)

    pretrained_weight = None
    if pretrain and (BC_max_iter > 0):
        # Pretrain with behavior cloning.
        from baselines.trpo_mpi import behavior_clone
        pretrained_weight, pi = behavior_clone.learn(ob, policy, mujo_dataset,
                                                     max_iters=BC_max_iter)
        evaluate_policy(pi, eval_env, -2, timesteps_per_batch, 0)
    else:
        with tf.variable_scope("pi"):
            pi = policy(observ_placeholder=ob)

    with tf.variable_scope("oldpi"):
        oldpi = policy(observ_placeholder=ob)

    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    entbonus = ent_coef * meanent

    vferr = tf.reduce_mean(tf.square(pi.vf - ret))

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = tf.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = get_trainable_variables("pi")
    # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
    # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]
    var_list = get_pi_trainable_variables("pi")
    vf_var_list = get_vf_trainable_variables("pi")

    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n([tf.reduce_sum(g * tangent)
                    for (g, tangent) in zipsame(klgrads, tangents)])  # pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[tf.assign(oldv, newv)
                 for (oldv, newv) in zipsame(get_variables("oldpi"), get_variables("pi"))])

    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg],
                                     losses + [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        if MPI is not None:
            out = np.empty_like(x)
            MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
            out /= nworkers
        else:
            out = np.copy(x)
        return out

    U.initialize()
    if load_path is not None:
        pi.load(load_path)

    th_init = get_flat()
    if MPI is not None:
        MPI.COMM_WORLD.Bcast(th_init, root=0)

    set_from_flat(th_init)
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # If a pretrained weight is provided, load it and evaluate once.
    if pretrained_weight is not None:
        U.load_variables(pretrained_weight,
                         variables=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "pi"))
        evaluate_policy(pi, eval_env, -1, timesteps_per_batch, 0)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards

    for epoch in range(num_epochs):
        if callback:
            callback(locals(), globals())

        logger.log("********** Epoch %i ************" % epoch)

        with timed("sampling"):
            seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate

        # if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret)
        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        args = seg["ob"], seg["ac"], atarg
        fvpargs = [arr[::5] for arr in args]

        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        assign_old_eq_new()  # set old parameter values to new parameter values
        with timed("computegrad"):
            *lossbefore, g = compute_lossandgrad(*args)
        lossbefore = allmean(np.array(lossbefore))
        g = allmean(g)
        if np.allclose(g, 0):
            logger.log("Got zero gradient. not updating")
        else:
            with timed("cg"):
                stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0)
            assert np.isfinite(stepdir).all()
            shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
            lm = np.sqrt(shs / max_kl)
            # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
            fullstep = stepdir / lm
            expectedimprove = g.dot(fullstep)
            surrbefore = lossbefore[0]
            stepsize = 1.0
            thbefore = get_flat()
            for _ in range(10):
                thnew = thbefore + fullstep * stepsize
                set_from_flat(thnew)
                meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args)))
                improve = surr - surrbefore
                logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve))
                if not np.isfinite(meanlosses).all():
                    logger.log("Got non-finite value of losses -- bad!")
                elif kl > max_kl * 1.5:
                    logger.log("violated KL constraint. shrinking step.")
                elif improve < 0:
                    logger.log("surrogate didn't improve. shrinking step.")
                else:
                    logger.log("Stepsize OK!")
                    break
                stepsize *= .5
            else:
                logger.log("couldn't compute a good step")
                set_from_flat(thbefore)
            if nworkers > 1 and iters_so_far % 20 == 0:
                # list of tuples
                paramsums = MPI.COMM_WORLD.allgather((thnew.sum(), vfadam.getflat().sum()))
                assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:])

        with timed("vf"):
            for _ in range(vf_iters):
                # `dataset` here is the baselines.common.dataset module, not the expert data.
                for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]),
                                                         include_final_partial_batch=False,
                                                         batch_size=64):
                    g = allmean(compute_vflossandgrad(mbob, mbret))
                    vfadam.update(g, vf_stepsize)

        if epoch % evaluation_freq == 0:
            evaluate_policy(pi, eval_env, epoch, timesteps_per_batch, tstart)

    return pi
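# A hedged usage sketch for learn() above. The 'mlp' network string follows
# the baselines build_policy convention; all other values are illustrative:
#
#     env = gym.make('Hopper-v1')
#     eval_env = gym.make('Hopper-v1')
#     pi = learn(network='mlp', env=env, eval_env=eval_env, env_id='Hopper-v1',
#                seed=0, num_epochs=500, evaluation_freq=10,
#                pretrain=True, expert_path='dataset/hopper.npz')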
def load_dataset(expert_path):
    dataset = Mujoco_Dset(expert_path=expert_path)
    return dataset
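# Example usage (the path is illustrative). Mujoco_Dset reads expert
# trajectories stored as an .npz archive of observations and actions; the
# get_next_batch call below assumes the baselines GAIL dataset API:
#
#     dataset = load_dataset('data/pnp_demo.npz')
#     ob_expert, ac_expert = dataset.get_next_batch(64)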
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    import MujocoManip as MM

    if args.task == 'train':
        env_name, user_name = osp.basename(args.expert_path).split('.')[0].split('_')
    else:
        env_name, user_name = osp.basename(args.load_model_path).split('.')[:2]
    wrapper = '%sWrapper' % env_name
    render = True if args.task == 'evaluate' else False

    if env_name == 'SawyerLiftEnv':
        env = MM.make(wrapper, ignore_done=False, use_eef_ctrl=False,
                      gripper_visualization=True, use_camera_obs=False,
                      has_renderer=render, reward_shaping=True,
                      has_offscreen_renderer=render)
    elif env_name == 'SawyerBinsEnv':
        env = MM.make(wrapper, ignore_done=False, use_eef_ctrl=False,
                      gripper_visualization=True, use_camera_obs=False,
                      has_renderer=render, reward_shaping=True,
                      single_object_mode=False if 'hard' in user_name.lower() else True,
                      has_offscreen_renderer=render)
    elif env_name == 'SawyerPegsEnv':
        env = MM.make(wrapper, ignore_done=False, use_eef_ctrl=False,
                      gripper_visualization=True, use_camera_obs=False,
                      has_renderer=render, reward_shaping=True,
                      single_object_mode=False if 'hard' in user_name.lower() else True,
                      has_offscreen_renderer=render)
    else:
        raise NotImplementedError

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    reuse=reuse, hid_size=args.policy_hidden_size,
                                    num_hid_layers=2)

    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(env_name, user_name) + '_%s_%s' % (
        args.algo, 1 if not args.mix_reward else args.rew_lambda)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)
    os.makedirs(args.log_dir, exist_ok=True)

    if args.task == 'train':
        dataset = Mujoco_Dset(expert_path=args.expert_path, traj_limitation=args.traj_limitation)
        reward_giver = TransitionClassifier(env, args.adversary_hidden_size,
                                            entcoeff=args.adversary_entcoeff)
        train(env, args.seed, policy_fn, reward_giver, dataset, args.algo,
              args.g_step, args.d_step, args.policy_entcoeff, args.num_timesteps,
              args.save_per_iter, args.checkpoint_dir, args.log_dir,
              args.pretrained, args.BC_max_iter, args.rew_lambda,
              args.mix_reward, task_name, args.frame_stack)
    elif args.task == 'evaluate':
        visualizer(env, policy_fn, args.load_model_path,
                   timesteps_per_batch=env.env.horizon,
                   number_trajs=10,
                   stochastic_policy=args.stochastic_policy,
                   save=args.save_sample)
    else:
        raise NotImplementedError
    env.close()
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    # Switch from gym to robosuite; reward shaping is enabled so progress
    # toward the reach goal is visible in the reward.
    env = robosuite.make(args.env_id,
                         ignore_done=True,
                         use_camera_obs=False,
                         has_renderer=True,
                         control_freq=100,
                         gripper_visualization=True,
                         reward_shaping=True,
                         #box_pos=[0.63522776, -0.3287869, 0.82162434],  # shift2
                         #box_quat=[0.6775825618903728, 0, 0, 0.679425538604203],  # shift2
                         )
    env = GymWrapper(env)  # wrap in the gym environment

    #task = 'train'
    task = 'evaluate'

    # Expert path (100 trajectories).
    expert_path = '/home/mastercljohnson/Robotics/GAIL_Part/mod_surreal/robosuite/models/assets/demonstrations/150_grasp_shift2/combined/combined_0.npz'
    #parser.add_argument('--expert_path', type=str, default='data/deterministic.trpo.Hopper.0.00.npz')

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy_sawyer.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                           reuse=reuse, hid_size=args.policy_hidden_size,
                                           num_hid_layers=2)

    # Note: taking away the bench.Monitor wrapping allows rendering.
    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"),
                        allow_early_resets=True)
    #env.seed(args.seed)  # Sawyer does not have a seed
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)
    logger.log("log_directories: ", args.log_dir)
    logger.log("environment action space range: ", env.action_space)  # log the action space

    # ------- Run the reach policy ------- #
    play_env = robosuite.make(args.env_id,
                              ignore_done=True,
                              use_camera_obs=False,
                              has_renderer=True,
                              control_freq=100,
                              gripper_visualization=True,
                              #box_pos=[0.63522776, -0.3287869, 0.82162434],  # shift2
                              #box_quat=[0.6775825618903728, 0, 0, 0.679425538604203],  # shift2
                              )
    play_env = GymWrapper(play_env)
    # Weights are loaded from the reach model (grasp_strange).
    #play_env.viewer.set_camera(camera_id=2)  # Switch views for eval

    # Setup network
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi_reach = policy_fn("pi", ob_space, ac_space, reuse=False)

    # Hack for loading policies using tensorflow.
    init_op = tf.compat.v1.global_variables_initializer()
    saver = tf.compat.v1.train.Saver(max_to_keep=5)
    with tf.compat.v1.Session() as sess:
        sess.run(init_op)
        # Load checkpoint.
        ckpt_path = './reach_and_grasp_weights/reach_one/trpo_gail.transition_limitation_2100.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/'
        ckpt = tf.compat.v1.train.get_checkpoint_state(ckpt_path)
        saver.restore(sess, ckpt.model_checkpoint_path)

        # Play back one reach trajectory to get the hand-off state.
        _, _, last_ob, last_jpos = runner_1_traj(play_env, pi_reach, None,
                                                 timesteps_per_batch=3500,
                                                 number_trajs=1,
                                                 stochastic_policy=args.stochastic_policy,
                                                 save=False)

    if task == 'train':
        play_env.close()
        dataset = Mujoco_Dset(expert_path=expert_path, traj_limitation=args.traj_limitation)
        reward_giver = TransitionClassifier(env, args.adversary_hidden_size,
                                            entcoeff=args.adversary_entcoeff)
        train_grasp(env, last_ob, last_jpos, args.seed, policy_fn, reward_giver,
                    dataset, args.algo, args.g_step, args.d_step,
                    args.policy_entcoeff, args.num_timesteps, args.save_per_iter,
                    args.checkpoint_dir, args.log_dir, args.pretrained,
                    args.BC_max_iter, task_name)
    elif task == 'evaluate':
        pi_grasp = policy_fn("pi_grasp", ob_space, ac_space, reuse=False)
        saver_2 = tf.compat.v1.train.Saver(max_to_keep=5)
        with tf.compat.v1.Session() as sess:
            sess.run(init_op)
            ckpt_path_2 = './reach_and_grasp_weights/grasp_shift1_after_reach/grasptrpo_gail.transition_limitation_2000.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/'
            ckpt_2 = tf.compat.v1.train.get_checkpoint_state(ckpt_path_2)
            saver_2.restore(sess, ckpt_2.model_checkpoint_path)

            # Roll out the grasp policy from the reach policy's final observation.
            tt = 0
            ob = last_ob
            while True:
                ac, vpred = pi_grasp.act(False, ob)
                ob, rew, new, _ = play_env.step(ac)
                play_env.render()  # check the rendering for the first part
                #logger.log("rendering for reach policy")
                if new or tt >= args.traj_limitation:
                    break
                tt += 1
        play_env.close()
    env.close()
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    import MujocoManip as MM

    if args.task == 'train':
        env_name, user_name = osp.basename(args.expert_path).split('.')[0].split('_')
    else:
        uenv, user_name = osp.basename(args.load_model_path).split('.')[:2]
        env_name = uenv.split('_')[1]
    wrapper = '%sWrapper' % env_name
    render = True if args.task == 'evaluate' else False
    print('%s initialized.' % wrapper)

    bin_dict = dict(milk=0, bread=1, cereal=2, can=3)
    peg_dict = dict(square=0, round=1)

    if env_name == 'SawyerLiftEnv':
        env = MM.make(wrapper, ignore_done=False, use_eef_ctrl=False,
                      gripper_visualization=True, use_camera_obs=False,
                      has_renderer=render, reward_shaping=True,
                      has_offscreen_renderer=False)
    elif env_name == 'SawyerBinsEnv':
        env = MM.make(wrapper, ignore_done=False, use_eef_ctrl=False,
                      gripper_visualization=True, use_camera_obs=False,
                      has_renderer=render, reward_shaping=True,
                      single_object_mode=False if 'hard' in user_name.lower() else True,
                      has_offscreen_renderer=False,
                      selected_bin=None if 'hard' in user_name.lower() else bin_dict[user_name.lower()])
    elif env_name == 'SawyerPegsEnv':
        env = MM.make(wrapper, ignore_done=False, use_eef_ctrl=False,
                      gripper_visualization=True, use_camera_obs=False,
                      has_renderer=render, reward_shaping=True,
                      # 'hard' check mirrors the SawyerBinsEnv branch; the original
                      # bare user_name.lower() test was always truthy.
                      single_object_mode=False if 'hard' in user_name.lower() else True,
                      has_offscreen_renderer=False,
                      selected_bin=None if 'hard' in user_name.lower() else peg_dict[user_name.lower()])
    else:
        raise NotImplementedError

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    reuse=reuse, hid_size=args.policy_hidden_size,
                                    num_hid_layers=3)

    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(env_name, user_name)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    if args.task == 'train':
        dataset = Mujoco_Dset(expert_path=args.expert_path, traj_limitation=args.traj_limitation)
        savedir_fname = learn(env, policy_fn, dataset,
                              max_iters=args.BC_max_iter,
                              ckpt_dir=args.checkpoint_dir,
                              log_dir=args.log_dir,
                              task_name=task_name,
                              verbose=True)
    elif args.task == 'evaluate':
        visualizer(env, policy_fn, args.load_model_path, env.env.horizon, 10,
                   args.stochastic_policy, save=args.save_sample)
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)
    logger.configure()
    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    if args.task == 'train':
        from baselines.gail import mlp_policy

        def policy_fn(name, ob_space, ac_space, reuse=False):
            return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                        reuse=reuse, hid_size=args.policy_hidden_size,
                                        num_hid_layers=2)

        dataset = Mujoco_Dset(expert_path=args.expert_path, traj_limitation=args.traj_limitation)
        # states_only presumably selects a discriminator over states rather
        # than (state, action) pairs.
        if args.states_only:
            reward_giver = WeakTransitionClassifier(env, args.adversary_hidden_size,
                                                    entcoeff=args.adversary_entcoeff)
        else:
            reward_giver = TransitionClassifier(env, args.adversary_hidden_size,
                                                entcoeff=args.adversary_entcoeff)
        train(env, args.seed, policy_fn, reward_giver, dataset, args.algo,
              args.g_step, args.d_step, args.policy_entcoeff, args.num_timesteps,
              args.save_per_iter, args.checkpoint_dir, args.log_dir,
              args.pretrained, args.BC_max_iter, task_name, args.states_only)
    elif args.task == 'evaluate':
        from baselines.gail import mlp_policy

        def policy_fn(name, ob_space, ac_space, reuse=False):
            return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                        reuse=reuse, hid_size=args.policy_hidden_size,
                                        num_hid_layers=2)

        runner(env, policy_fn, args.load_model_path,
               timesteps_per_batch=1024,
               number_trajs=args.traj_limitation,
               stochastic_policy=args.stochastic_policy,
               save=args.save_sample)
    elif args.task == 'expert_train':
        from baselines.trpo_mpi import trpo_mpi as original_trpo
        from baselines.ppo1.mlp_policy import MlpPolicy as OriginalMlpPolicy

        def policy_fn(name, ob_space, ac_space, reuse=False):
            return OriginalMlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                     hid_size=args.policy_hidden_size, num_hid_layers=2)

        original_trpo.learn(env, policy_fn,
                            timesteps_per_batch=1024, max_kl=0.01, cg_iters=10,
                            cg_damping=0.1, max_timesteps=args.num_timesteps,
                            gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3)
        saver = tf.train.Saver()
        saver.save(tf.get_default_session(), args.save_model_path)
    elif args.task == 'expert_gen':
        from baselines.ppo1.mlp_policy import MlpPolicy as OriginalMlpPolicy

        def policy_fn(name, ob_space, ac_space, reuse=False):
            return OriginalMlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                     hid_size=args.policy_hidden_size, num_hid_layers=2)

        runner(env, policy_fn, args.save_model_path,
               timesteps_per_batch=1024,
               number_trajs=args.traj_limitation,
               stochastic_policy=args.stochastic_policy,
               save=args.save_sample)
    else:
        raise NotImplementedError
    env.close()
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)
    # env = DelayRewardWrapper(env, args.delay_freq, args.max_path_length)
    eval_env = gym.make(args.env_id)
    logger.configure(os.path.join("log", "GAIL", args.env_id,
                                  "subsample_{}".format(args.subsample_freq),
                                  "traj_{}".format(args.traj_limitation),
                                  "seed_{}".format(args.seed)))

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    reuse=reuse, hid_size=args.policy_hidden_size,
                                    num_hid_layers=2,
                                    gaussian_fixed_var=args.gaussian_fixed_var,
                                    obs_normalize=args.obs_normalize)

    env.seed(args.seed)
    eval_env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, "GAIL", task_name)

    if args.task == 'train':
        dataset = Mujoco_Dset(expert_path=args.expert_path,
                              traj_limitation=args.traj_limitation,
                              data_subsample_freq=args.subsample_freq)
        reward_giver = TransitionClassifier(env, args.adversary_hidden_size,
                                            entcoeff=args.adversary_entcoeff,
                                            obs_normalize=args.obs_normalize)
        train(env, eval_env, args.seed, policy_fn, reward_giver, dataset,
              args.algo, args.g_step, args.d_step, args.policy_entcoeff,
              args.save_per_iter, args.checkpoint_dir, args.log_dir,
              args.pretrained, args.BC_max_iter, args.num_epochs,
              args.evaluation_freq, args.timesteps_per_batch, task_name)
    elif args.task == 'evaluate':
        runner(env, policy_fn, args.load_model_path,
               timesteps_per_batch=args.timesteps_per_batch,
               number_trajs=10,
               stochastic_policy=args.stochastic_policy,
               save=args.save_sample)
    else:
        raise NotImplementedError
    env.close()
def setup_and_learn(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale,
                    render, actor, critic, classifier, normalize_returns,
                    normalize_observations, critic_l2_reg, classifier_l2_reg,
                    actor_lr, critic_lr, classifier_lr, action_noise, popart,
                    gamma, clip_norm, nb_train_steps, nb_rollout_steps,
                    nb_eval_steps, batch_size, memory, fifomemory, tau=0.01,
                    eval_env=None, callback=None, entropy_coeff=1.,
                    reward_giver=None, expert_dataset=None, g_step=4, d_step=1,
                    d_stepsize=3e-4, max_timesteps=0, max_iters=0,
                    timesteps_per_batch=1024, adversary_hidden_size=100,
                    adversary_entcoeff=1e-3, task='train', expert_path=None):
    # TODO: max_episodes
    """Set up the learning agent and execute training."""
    logger.info('Initialize policy')
    logger.info('noisynet implementation of DDPG')
    assert task == 'train'
    # We assume symmetric actions.
    assert (np.abs(env.action_space.low) == env.action_space.high).all()
    max_action = env.action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))

    agent = DDPG_paramnoise(actor, critic, classifier, memory, fifomemory,
                            env.observation_space.shape, env.action_space.shape,
                            gamma=gamma, tau=tau,
                            normalize_returns=normalize_returns,
                            normalize_observations=normalize_observations,
                            batch_size=batch_size, action_noise=action_noise,
                            critic_l2_reg=critic_l2_reg,
                            classifier_l2_reg=classifier_l2_reg,
                            actor_lr=actor_lr, critic_lr=critic_lr,
                            classifier_lr=classifier_lr,
                            enable_popart=popart, clip_norm=clip_norm,
                            reward_scale=reward_scale,
                            entropy_coeff=entropy_coeff)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    logger.info('Initialize Discriminator')
    reward_giver = TransitionClassifier(env, adversary_hidden_size,
                                        entcoeff=adversary_entcoeff)
    d_adam = MpiAdam(reward_giver.get_trainable_variables())

    logger.info('Load Expert Data')
    dataset = Mujoco_Dset(expert_path=expert_path, traj_limitation=-1)  # TODO: customize

    logger.info('Start training')
    with U.single_threaded_session() as sess:
        # Initialize the agent.
        agent.initialize(sess)
        # TF saver for checkpoints.
        saver = tf.train.Saver()
        # Finalize the graph to catch accidental additions.
        sess.graph.finalize()

        learn(env, agent, reward_giver, dataset, g_step, d_step,
              d_stepsize=d_stepsize,
              timesteps_per_batch=timesteps_per_batch,
              nb_train_steps=nb_train_steps,
              max_timesteps=max_timesteps,
              max_iters=max_iters,  # TODO: max_episodes
              callback=callback,
              d_adam=d_adam,
              sess=sess,
              saver=saver)
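# A hypothetical call sketch for setup_and_learn() above. The actor, critic,
# classifier, and memory objects are this repo's DDPG_paramnoise components
# (not shown here), so treat this as shape-only, not a runnable recipe:
#
#     setup_and_learn(env, nb_epochs=500, nb_epoch_cycles=20, render_eval=False,
#                     reward_scale=1.0, render=False, actor=actor, critic=critic,
#                     classifier=classifier, normalize_returns=False,
#                     normalize_observations=True, critic_l2_reg=1e-2,
#                     classifier_l2_reg=1e-2, actor_lr=1e-4, critic_lr=1e-3,
#                     classifier_lr=3e-4, action_noise=None, popart=False,
#                     gamma=0.99, clip_norm=None, nb_train_steps=50,
#                     nb_rollout_steps=100, nb_eval_steps=100, batch_size=64,
#                     memory=memory, fifomemory=fifomemory,
#                     expert_path='data/expert.npz')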
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)
    # Delayed-reward training env: step() returns next_obs, the delayed reward
    # (0 on most steps, the accumulated reward when it is released), done, info.
    env = DelayRewardWrapper(env, args.reward_freq, 1000)
    # Evaluation env with the real (undelayed) reward.
    eval_env = gym.make(args.env_id)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    reuse=reuse, hid_size=args.policy_hidden_size,
                                    num_hid_layers=2)

    # Set random seeds.
    env.seed(args.seed)
    eval_env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, "reward_coeff_" + str(args.reward_coeff),
                            args.env_id, "seed_" + str(args.seed))

    if args.task == 'train':
        dataset = Mujoco_Dset(expert_path=args.expert_path, traj_limitation=args.traj_limitation)
        reward_giver = TransitionClassifier(env, args.adversary_hidden_size,
                                            entcoeff=args.adversary_entcoeff)
        train(env, eval_env, args.seed, policy_fn, reward_giver, dataset,
              args.algo, args.g_step, args.d_step, args.policy_entcoeff,
              args.reward_coeff, args.num_timesteps, args.save_per_iter,
              args.checkpoint_dir, args.log_dir, args.pretrained,
              args.BC_max_iter, args.num_epochs, args.eval_interval,
              args.timesteps_per_batch, task_name)
    elif args.task == 'evaluate':
        runner(env, policy_fn, args.load_model_path,
               timesteps_per_batch=args.timesteps_per_batch,
               number_trajs=10,
               stochastic_policy=args.stochastic_policy,
               save=args.save_sample)
    else:
        raise NotImplementedError
    env.close()
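# DelayRewardWrapper is not defined in this file. A minimal sketch of the
# contract described above (class name suffixed and details assumed): rewards
# are accumulated and released every `reward_freq` steps or at episode end;
# all other steps return 0.
import gym

class DelayRewardWrapperSketch(gym.Wrapper):
    def __init__(self, env, reward_freq, max_path_length):
        super().__init__(env)
        self.reward_freq = reward_freq
        self.max_path_length = max_path_length
        self._acc = 0.0   # accumulated (withheld) reward
        self._t = 0       # steps since the episode started

    def reset(self, **kwargs):
        self._acc, self._t = 0.0, 0
        return self.env.reset(**kwargs)

    def step(self, action):
        obs, rew, done, info = self.env.step(action)
        self._acc += rew
        self._t += 1
        if done or self._t % self.reward_freq == 0 or self._t >= self.max_path_length:
            delayed, self._acc = self._acc, 0.0  # release the accumulated reward
        else:
            delayed = 0.0
        return obs, delayed, done, info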