def train(
        env,
        cost_fn,
        logdir=None,
        render=False,
        learning_rate=1e-3,
        onpol_iters=10,
        dynamics_iters=60,
        batch_size=512,
        num_paths_random=10,
        num_paths_onpol=10,
        num_simulated_paths=10000,
        env_horizon=1000,
        mpc_horizon=15,
        n_layers=2,
        size=500,
        activation=tf.nn.relu,
        output_activation=None,
        clip_param=0.2,
        entcoeff=0.0,
        gamma=0.99,
        lam=0.95,
        optim_epochs=10,
        optim_batchsize=64,
        schedule='linear',
        bc_lr=1e-3,
        ppo_lr=3e-4,
        timesteps_per_actorbatch=1000,
        MPC=True,
        BEHAVIORAL_CLONING=True,
        PPO=True,
):
    start = time.time()

    logz.configure_output_dir(logdir)

    merged_summary, summary_writer, ppo_return_op, mpc_return_op, model_loss_op, \
        reward_loss_op, ppo_std_op, mpc_std_op = build_summary_ops(logdir, env)

    print("-------- env info --------")
    print("Environment: ", FLAGS.env_name)
    print("observation_space: ", env.observation_space.shape)
    print("action_space: ", env.action_space.shape)
    print("action_space low: ", env.action_space.low)
    print("action_space high: ", env.action_space.high)
    print("BEHAVIORAL_CLONING: ", BEHAVIORAL_CLONING)
    print("PPO: ", PPO)
    print("MPC-AUG: ", MPC)
    print(" ")

    random_controller = RandomController(env)

    # Create buffers
    model_data_buffer = DataBufferGeneral(FLAGS.MODELBUFFER_SIZE, 5)
    ppo_data_buffer = DataBufferGeneral(10000, 4)
    bc_data_buffer = DataBufferGeneral(2000, 2)

    # Random sample paths
    print("collecting random data ..... ")
    paths = sample(env,
                   random_controller,
                   num_paths=num_paths_random,
                   horizon=env_horizon,
                   render=False,
                   verbose=False)

    # add into buffer
    for path in paths:
        for n in range(len(path['observations'])):
            model_data_buffer.add([
                path['observations'][n],
                path['actions'][n],
                path['rewards'][n],
                path['next_observations'][n],
                path['next_observations'][n] - path['observations'][n]
            ])

    print("model data buffer size: ", model_data_buffer.size)

    normalization = compute_normalization(model_data_buffer)

    #========================================================
    #
    # Build dynamics model and MPC controllers and Behavioral cloning network.
    #
    # tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    sess = tf.Session(config=tf_config)

    policy_nn = MlpPolicy(sess=sess,
                          env=env,
                          hid_size=128,
                          num_hid_layers=2,
                          clip_param=clip_param,
                          entcoeff=entcoeff)

    if FLAGS.LEARN_REWARD:
        print("Learn reward function")
        dyn_model = NNDynamicsRewardModel(env=env,
                                          normalization=normalization,
                                          batch_size=batch_size,
                                          iterations=dynamics_iters,
                                          learning_rate=learning_rate,
                                          sess=sess)

        mpc_ppo_controller = MPCcontrollerPolicyNetReward(
            env=env,
            dyn_model=dyn_model,
            explore=FLAGS.MPC_EXP,
            policy_net=policy_nn,
            self_exp=FLAGS.SELFEXP,
            horizon=mpc_horizon,
            num_simulated_paths=num_simulated_paths)
    else:
        print("Use predefined cost function")
        dyn_model = NNDynamicsModel(env=env,
                                    n_layers=n_layers,
                                    size=size,
                                    activation=activation,
                                    output_activation=output_activation,
                                    normalization=normalization,
                                    batch_size=batch_size,
                                    iterations=dynamics_iters,
                                    learning_rate=learning_rate,
                                    sess=sess)

        mpc_ppo_controller = MPCcontrollerPolicyNet(
            env=env,
            dyn_model=dyn_model,
            explore=FLAGS.MPC_EXP,
            policy_net=policy_nn,
            self_exp=FLAGS.SELFEXP,
            horizon=mpc_horizon,
            cost_fn=cost_fn,
            num_simulated_paths=num_simulated_paths)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    # if not PPO:
    #     mpc_ppo_controller = mpc_controller

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    # init or load checkpoint with saver
    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(logdir)

    if checkpoint and checkpoint.model_checkpoint_path and FLAGS.LOAD_MODEL:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old checkpoint")
        if not os.path.exists(logdir):
            os.mkdir(logdir)

    #========================================================
    #
    # Prepare for rollouts
    #
    episodes_so_far = 0
    timesteps_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    max_timesteps = num_paths_onpol * env_horizon

    bc = False
    ppo_mpc = False
    mpc_returns = 0
    model_loss = 0
    reward_loss = 0  # keeps the summary feed_dict valid when MPC is disabled

    for itr in range(onpol_iters):
        print(" ")
        print("onpol_iters: ", itr)

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)

        print("bc learning_rate: ", bc_lr)
        print("ppo learning_rate: ", ppo_lr)

        ################## fit mpc model
        if MPC:
            model_loss, reward_loss = dyn_model.fit(model_data_buffer)

        ################## ppo seg data
        ppo_data_buffer.clear()

        # ppo_seg = traj_segment_generator_ppo(policy_nn, env, env_horizon)
        ppo_mpc = False
        mpc = False
        ppo_seg = traj_segment_generator(policy_nn, mpc_controller,
                                         mpc_ppo_controller, bc_data_buffer,
                                         env, mpc, ppo_mpc, env_horizon)

        add_vtarg_and_adv(ppo_seg, gamma, lam)

        ob, ac, rew, nxt_ob, atarg, tdlamret = \
            ppo_seg["ob"], ppo_seg["ac"], ppo_seg["rew"], ppo_seg["nxt_ob"], ppo_seg["adv"], ppo_seg["tdlamret"]
        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate

        # add into buffer
        for n in range(len(ob)):
            ppo_data_buffer.add([ob[n], ac[n], atarg[n], tdlamret[n]])
            model_data_buffer.add(
                [ob[n], ac[n], rew[n], nxt_ob[n], nxt_ob[n] - ob[n]])

        ppo_std = np.std(ac, axis=0)
        print("ppo_std: ", ppo_std)

        ################## mpc augmented seg data
        if MPC:
            print("MPC AUG PPO")
            ppo_mpc = True
            mpc = True
            mpc_seg = traj_segment_generator(policy_nn, mpc_controller,
                                             mpc_ppo_controller,
                                             bc_data_buffer, env, mpc,
                                             ppo_mpc, env_horizon)

            add_vtarg_and_adv(mpc_seg, gamma, lam)

            ob, ac, mpcac, rew, nxt_ob, atarg, tdlamret = \
                mpc_seg["ob"], mpc_seg["ac"], mpc_seg["mpcac"], mpc_seg["rew"], \
                mpc_seg["nxt_ob"], mpc_seg["adv"], mpc_seg["tdlamret"]
            atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate

            mpc_returns = mpc_seg["ep_rets"]
            mpc_std = np.std(mpcac)

        if not MPC:
            mpc_std = 0

        ################## mpc random seg data
        if FLAGS.mpc_rand:
            print("MPC Random base policy")
            ppo_mpc = False
            mpc = True
            mpc_random_seg = traj_segment_generator(policy_nn, mpc_controller,
                                                    mpc_ppo_controller,
                                                    bc_data_buffer, env, mpc,
                                                    ppo_mpc, env_horizon)

            add_vtarg_and_adv(mpc_random_seg, gamma, lam)

            ob, ac, mpcac, rew, nxt_ob, atarg, tdlamret = \
                mpc_random_seg["ob"], mpc_random_seg["ac"], mpc_random_seg["mpcac"], \
                mpc_random_seg["rew"], mpc_random_seg["nxt_ob"], mpc_random_seg["adv"], \
                mpc_random_seg["tdlamret"]
            atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate

            mpc_rand_returns = mpc_random_seg["ep_rets"]

        ################## PPO deterministic evaluation
        ppo_deterministic_return = policy_net_eval(sess,
                                                   env,
                                                   policy_nn,
                                                   env_horizon,
                                                   stochastic=False)

        ################## optimization
        print("ppo_data_buffer size", ppo_data_buffer.size)
        print("bc_data_buffer size", bc_data_buffer.size)
        print("model data buffer size: ", model_data_buffer.size)

        # optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(policy_nn, "ob_rms"):
            policy_nn.ob_rms.update(ob)  # update running mean/std for policy
        policy_nn.assign_old_eq_new()  # set old parameter values to new parameter values

        for op_ep in range(optim_epochs):
            # losses = []  # list of tuples, each of which gives the loss for a minibatch
            # for i in range(int(timesteps_per_actorbatch/optim_batchsize)):
            if PPO:
                sample_ob_no, sample_ac_na, sample_adv_n, sample_b_n_target = \
                    ppo_data_buffer.sample(optim_batchsize)
                newlosses = policy_nn.lossandupdate_ppo(
                    sample_ob_no, sample_ac_na, sample_adv_n,
                    sample_b_n_target, cur_lrmult, ppo_lr * cur_lrmult)
                # losses.append(newlosses)

            if BEHAVIORAL_CLONING and bc:
                sample_ob_no, sample_ac_na = bc_data_buffer.sample(optim_batchsize)
                # print("sample_ob_no", sample_ob_no.shape)
                # print("sample_ac_na", sample_ac_na.shape)
                policy_nn.update_bc(sample_ob_no, sample_ac_na, bc_lr * cur_lrmult)

            if op_ep % 100 == 0 and BEHAVIORAL_CLONING and bc:
                print('epoch: ', op_ep)
                policy_net_eval(sess, env, policy_nn, env_horizon)

        ################## print and save data
        seg = ppo_seg
        ep_lengths = seg["ep_lens"]
        returns = seg["ep_rets"]

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)

        # log ppo
        logz.log_tabular("TimeSoFar", time.time() - start)
        logz.log_tabular("TimeEp", time.time() - tstart)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsSoFar", timesteps_so_far)
        logz.log_tabular("Condition", "PPO")
        logz.dump_tabular()

        # log ppo deterministic
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", ppo_deterministic_return)
        logz.log_tabular("Condition", "PPO_DETERMINISTIC")
        logz.dump_tabular()

        # log mpc
        if MPC:
            logz.log_tabular("TimeSoFar", time.time() - start)
            logz.log_tabular("TimeEp", time.time() - tstart)
            logz.log_tabular("Iteration", itr)
            logz.log_tabular("AverageReturn", np.mean(mpc_returns))
            logz.log_tabular("StdReturn", np.std(mpc_returns))
            logz.log_tabular("MaxReturn", np.max(mpc_returns))
            logz.log_tabular("MinReturn", np.min(mpc_returns))
            logz.log_tabular("EpLenMean", np.mean(ep_lengths))
            logz.log_tabular("EpLenStd", np.std(ep_lengths))
            logz.log_tabular("TimestepsSoFar", timesteps_so_far)
            logz.log_tabular("Condition", "MPC_PPO")
            logz.dump_tabular()

        if FLAGS.mpc_rand:
            logz.log_tabular("TimeSoFar", time.time() - start)
            logz.log_tabular("TimeEp", time.time() - tstart)
            logz.log_tabular("Iteration", itr)
            logz.log_tabular("AverageReturn", np.mean(mpc_rand_returns))
            logz.log_tabular("StdReturn", np.std(mpc_rand_returns))
            logz.log_tabular("MaxReturn", np.max(mpc_rand_returns))
            logz.log_tabular("MinReturn", np.min(mpc_rand_returns))
            logz.log_tabular("EpLenMean", np.mean(ep_lengths))
            logz.log_tabular("EpLenStd", np.std(ep_lengths))
            logz.log_tabular("TimestepsSoFar", timesteps_so_far)
            logz.log_tabular("Condition", "MPC_RAND")
            logz.dump_tabular()

        # logz.pickle_tf_vars()

        tstart = time.time()

        ################## TF Summaries
        summary_str = sess.run(merged_summary,
                               feed_dict={
                                   ppo_return_op: np.mean(returns),
                                   mpc_return_op: np.mean(mpc_returns),
                                   model_loss_op: model_loss,
                                   ppo_std_op: ppo_std,
                                   reward_loss_op: reward_loss,
                                   mpc_std_op: mpc_std,
                               })
        summary_writer.add_summary(summary_str, itr)
        summary_writer.flush()

        ################## TF SAVE
        if itr % FLAGS.SAVE_ITER == 0 and itr != 0:
            save_path = saver.save(sess, logdir + "/model.ckpt")
            print("Model saved in path: %s" % save_path)
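
# ---------------------------------------------------------------------------
# Reference sketch (not part of the original file): the training loops in this
# repo call add_vtarg_and_adv(seg, gamma, lam) to turn a rollout segment into
# GAE(lambda) advantages ("adv") and TD(lambda) value targets ("tdlamret")
# before the PPO update. The sketch below mirrors the OpenAI Baselines helper
# this code appears to follow; it assumes the segment dict also carries "new"
# (episode-boundary flags), "vpred", and "nextvpred" arrays, which are not
# shown above.
import numpy as np


def add_vtarg_and_adv_sketch(seg, gamma, lam):
    """Compute GAE(lambda) advantages and TD(lambda) returns for one segment."""
    new = np.append(seg["new"], 0)                     # pad so new[t + 1] exists at the last step
    vpred = np.append(seg["vpred"], seg["nextvpred"])  # bootstrap with the value of the next state
    T = len(seg["rew"])
    seg["adv"] = gaelam = np.empty(T, 'float32')
    rew = seg["rew"]
    lastgaelam = 0
    for t in reversed(range(T)):
        nonterminal = 1 - new[t + 1]
        delta = rew[t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
        gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    seg["tdlamret"] = seg["adv"] + seg["vpred"]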
def train(env,
          cost_fn,
          logdir=None,
          render=False,
          learning_rate=1e-3,
          onpol_iters=10,
          dynamics_iters=60,
          batch_size=512,
          num_paths_random=10,
          num_paths_onpol=10,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=15,
          n_layers=2,
          size=500,
          activation=tf.nn.relu,
          output_activation=None,
          clip_param=0.2,
          entcoeff=0.0,
          gamma=0.99,
          lam=0.95,
          optim_epochs=10,
          optim_batchsize=64,
          schedule='linear',
          bc_lr=1e-3,
          ppo_lr=3e-4,
          timesteps_per_actorbatch=1000,
          MPC=True,
          BEHAVIORAL_CLONING=True,
          PPO=True,
          ):
    start = time.time()

    logz.configure_output_dir(logdir)

    print("-------- env info --------")
    print("observation_space: ", env.observation_space.shape)
    print("action_space: ", env.action_space.shape)
    print("BEHAVIORAL_CLONING: ", BEHAVIORAL_CLONING)
    print("PPO: ", PPO)
    print("MPC-AUG: ", MPC)
    print(" ")

    # initialize buffers
    model_data_buffer = DataBufferGeneral(1000000, 5)
    ppo_data_buffer = DataBufferGeneral(10000, 4)
    bc_data_buffer = DataBufferGeneral(BC_BUFFER_SIZE, 2)

    # random sample paths
    print("collecting random data ..... ")
    random_controller = RandomController(env)
    paths = sample(env,
                   random_controller,
                   num_paths=num_paths_random,
                   horizon=env_horizon,
                   render=False,
                   verbose=False)

    # add into buffer
    for path in paths:
        for n in range(len(path['observations'])):
            model_data_buffer.add([path['observations'][n],
                                   path['actions'][n],
                                   path['rewards'][n],
                                   path['next_observations'][n],
                                   path['next_observations'][n] - path['observations'][n]])

    print("model data buffer size: ", model_data_buffer.size)

    normalization = compute_normalization(model_data_buffer)

    #========================================================
    #
    # Build dynamics model and MPC controllers and Behavioral cloning network.
    #
    # tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    sess = tf.Session(config=tf_config)

    dyn_model = NNDynamicsRewardModel(env=env,
                                      normalization=normalization,
                                      batch_size=batch_size,
                                      iterations=dynamics_iters,
                                      learning_rate=learning_rate,
                                      sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    policy_nn = MlpPolicy(sess=sess,
                          env=env,
                          hid_size=256,
                          num_hid_layers=2,
                          clip_param=clip_param,
                          entcoeff=entcoeff)

    mpc_ppo_controller = MPCcontrollerPolicyNetReward(env=env,
                                                      dyn_model=dyn_model,
                                                      policy_net=policy_nn,
                                                      self_exp=False,
                                                      horizon=mpc_horizon,
                                                      num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    # init or load checkpoint with saver
    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR)

    if checkpoint and checkpoint.model_checkpoint_path and LOAD_MODEL:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old checkpoint")
        if not os.path.exists(CHECKPOINT_DIR):
            os.mkdir(CHECKPOINT_DIR)

    #========================================================
    #
    # Prepare for rollouts
    #
    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    max_timesteps = num_paths_onpol * env_horizon

    bc = False
    ppo_mpc = False
    mpc_returns = 0

    for itr in range(onpol_iters):
        print(" ")
        print("onpol_iters: ", itr)

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)

        print("bc learning_rate: ", bc_lr)
        print("ppo learning_rate: ", ppo_lr)

        ################## fit mpc model
        if MPC:
            dyn_model.fit(model_data_buffer)

        ################## ppo seg data
        if PPO:
            ppo_data_buffer.clear()

            # ppo_seg = traj_segment_generator_ppo(policy_nn, env, env_horizon)
            mpc = False
            ppo_seg = traj_segment_generator(policy_nn, mpc_controller, mpc_ppo_controller,
                                             bc_data_buffer, env, mpc, ppo_mpc, env_horizon)

            add_vtarg_and_adv(ppo_seg, gamma, lam)

            ob, ac, rew, nxt_ob, atarg, tdlamret = \
                ppo_seg["ob"], ppo_seg["ac"], ppo_seg["rew"], ppo_seg["nxt_ob"], ppo_seg["adv"], ppo_seg["tdlamret"]
            atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate

            # add into buffer
            for n in range(len(ob)):
                ppo_data_buffer.add([ob[n], ac[n], atarg[n], tdlamret[n]])

                if MPC:
                    model_data_buffer.add([ob[n], ac[n], rew[n], nxt_ob[n], nxt_ob[n] - ob[n]])

        ################## mpc augmented seg data
        if itr % MPC_AUG_GAP == 0 and MPC:
            print("MPC AUG PPO")
            ppo_mpc = True
            mpc = True
            mpc_seg = traj_segment_generator(policy_nn, mpc_controller, mpc_ppo_controller,
                                             bc_data_buffer, env, mpc, ppo_mpc, env_horizon)

            add_vtarg_and_adv(mpc_seg, gamma, lam)

            ob, ac, mpcac, rew, nxt_ob, atarg, tdlamret = \
                mpc_seg["ob"], mpc_seg["ac"], mpc_seg["mpcac"], mpc_seg["rew"], \
                mpc_seg["nxt_ob"], mpc_seg["adv"], mpc_seg["tdlamret"]
            atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate

            # add into buffer
            for n in range(len(ob)):
                # if PPO:
                #     ppo_data_buffer.add([ob[n], ac[n], atarg[n], tdlamret[n]])

                if BEHAVIORAL_CLONING and bc:
                    bc_data_buffer.add([ob[n], mpcac[n]])

                if MPC:
                    model_data_buffer.add([ob[n], mpcac[n], rew[n], nxt_ob[n], nxt_ob[n] - ob[n]])

            mpc_returns = mpc_seg["ep_rets"]

        seg = ppo_seg

        # check if seg is good
        ep_lengths = seg["ep_lens"]
        returns = seg["ep_rets"]

        # saver.save(sess, CHECKPOINT_DIR)

        if BEHAVIORAL_CLONING:
            if np.mean(returns) > 100:
                bc = True
            else:
                bc = False
            print("BEHAVIORAL_CLONING: ", bc)

            bc_return = behavioral_cloning_eval(sess, env, policy_nn, env_horizon)

            if bc_return > 100:
                ppo_mpc = True
            else:
                ppo_mpc = False

        ################## optimization
        print("ppo_data_buffer size", ppo_data_buffer.size)
        print("bc_data_buffer size", bc_data_buffer.size)
        print("model data buffer size: ", model_data_buffer.size)

        # optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(policy_nn, "ob_rms"):
            policy_nn.ob_rms.update(ob)  # update running mean/std for policy
        policy_nn.assign_old_eq_new()  # set old parameter values to new parameter values

        for op_ep in range(optim_epochs):
            # losses = []  # list of tuples, each of which gives the loss for a minibatch
            # for i in range(int(timesteps_per_actorbatch/optim_batchsize)):
            if PPO:
                sample_ob_no, sample_ac_na, sample_adv_n, sample_b_n_target = \
                    ppo_data_buffer.sample(optim_batchsize)
                newlosses = policy_nn.lossandupdate_ppo(sample_ob_no, sample_ac_na,
                                                        sample_adv_n, sample_b_n_target,
                                                        cur_lrmult, ppo_lr * cur_lrmult)
                # losses.append(newlosses)

            if BEHAVIORAL_CLONING and bc:
                sample_ob_no, sample_ac_na = bc_data_buffer.sample(optim_batchsize)
                # print("sample_ob_no", sample_ob_no.shape)
                # print("sample_ac_na", sample_ac_na.shape)
                policy_nn.update_bc(sample_ob_no, sample_ac_na, bc_lr * cur_lrmult)

            if op_ep % 100 == 0 and BEHAVIORAL_CLONING and bc:
                print('epoch: ', op_ep)
                behavioral_cloning_eval(sess, env, policy_nn, env_horizon)

        ################## print and save data
        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        # if np.mean(returns) > 1000:
        #     filename = "seg_data.pkl"
        #     pickle.dump(seg, open(filename, 'wb'))
        #     print("saved", filename)

        logz.log_tabular("TimeSoFar", time.time() - start)
        logz.log_tabular("TimeEp", time.time() - tstart)
        logz.log_tabular("Iteration", iters_so_far)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("MpcReturn", np.mean(mpc_returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        # logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", timesteps_so_far)
        logz.dump_tabular()
        logz.pickle_tf_vars()

        tstart = time.time()
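
# ---------------------------------------------------------------------------
# Illustrative sketch (an assumption, not the repo's implementation): the call
# sites above treat DataBufferGeneral(capacity, num_fields) as a bounded FIFO
# buffer of fixed-width records with .add(record), .sample(batch_size),
# .clear(), and a .size attribute, where .sample returns one stacked array per
# field so the result can be tuple-unpacked. A minimal class with that
# interface, under a different name to avoid shadowing the real one:
import random

import numpy as np


class DataBufferGeneralSketch(object):
    """Bounded buffer of fixed-width records, e.g. [ob, ac, adv, tdlamret]."""

    def __init__(self, capacity, num_fields):
        self.capacity = capacity
        self.num_fields = num_fields
        self.data = []

    @property
    def size(self):
        return len(self.data)

    def add(self, record):
        assert len(record) == self.num_fields
        if len(self.data) >= self.capacity:
            self.data.pop(0)  # evict the oldest record when full
        self.data.append(list(record))

    def sample(self, batch_size):
        batch = random.sample(self.data, min(batch_size, len(self.data)))
        return [np.asarray(field) for field in zip(*batch)]

    def clear(self):
        self.data = []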
def train(env,
          cost_fn,
          logdir=None,
          render=False,
          learning_rate=1e-3,
          onpol_iters=10,
          dynamics_iters=60,
          batch_size=512,
          num_paths_random=10,
          num_paths_onpol=10,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=15,
          n_layers=2,
          size=500,
          activation=tf.nn.relu,
          output_activation=None,
          clip_param=0.2,
          entcoeff=0.0,
          gamma=0.99,
          lam=0.95,
          optim_epochs=10,
          optim_batchsize=64,
          schedule='linear',
          bc_lr=1e-3,
          ppo_lr=3e-4,
          timesteps_per_actorbatch=1000,
          MPC=True,
          BEHAVIORAL_CLONING=True,
          PPO=True,
          ):
    start = time.time()

    print("-------- env info --------")
    print("Environment: ", FLAGS.env_name)
    print("observation_space: ", env.observation_space.shape)
    print("action_space: ", env.action_space.shape)
    print("action_space low: ", env.action_space.low)
    print("action_space high: ", env.action_space.high)
    print("BEHAVIORAL_CLONING: ", BEHAVIORAL_CLONING)
    print("PPO: ", PPO)
    print("MPC-AUG: ", MPC)
    print(" ")

    random_controller = RandomController(env)

    # Create buffers
    model_data_buffer = DataBufferGeneral(FLAGS.MODELBUFFER_SIZE, 5)
    ppo_data_buffer = DataBufferGeneral(10000, 4)
    bc_data_buffer = DataBufferGeneral(2000, 2)

    # Random sample paths
    print("collecting random data ..... ")
    paths = sample(env,
                   random_controller,
                   num_paths=num_paths_random,
                   horizon=env_horizon,
                   render=False,
                   verbose=False)

    # add into buffer
    for path in paths:
        for n in range(len(path['observations'])):
            model_data_buffer.add([path['observations'][n],
                                   path['actions'][n],
                                   path['rewards'][n],
                                   path['next_observations'][n],
                                   path['next_observations'][n] - path['observations'][n]])

    print("model data buffer size: ", model_data_buffer.size)

    normalization = compute_normalization(model_data_buffer)

    #========================================================
    #
    # Build dynamics model and MPC controllers and Behavioral cloning network.
    #
    # tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    sess = tf.Session(config=tf_config)

    policy_nn = MlpPolicy(sess=sess,
                          env=env,
                          hid_size=128,
                          num_hid_layers=2,
                          clip_param=clip_param,
                          entcoeff=entcoeff)

    if FLAGS.LEARN_REWARD:
        print("Learn reward function")
        dyn_model = NNDynamicsRewardModel(env=env,
                                          normalization=normalization,
                                          batch_size=batch_size,
                                          iterations=dynamics_iters,
                                          learning_rate=learning_rate,
                                          sess=sess)

        mpc_ppo_controller = MPCcontrollerPolicyNetReward(env=env,
                                                          dyn_model=dyn_model,
                                                          explore=FLAGS.MPC_EXP,
                                                          policy_net=policy_nn,
                                                          self_exp=FLAGS.SELFEXP,
                                                          horizon=mpc_horizon,
                                                          num_simulated_paths=num_simulated_paths)
    else:
        print("Use predefined cost function")
        dyn_model = NNDynamicsModel(env=env,
                                    n_layers=n_layers,
                                    size=size,
                                    activation=activation,
                                    output_activation=output_activation,
                                    normalization=normalization,
                                    batch_size=batch_size,
                                    iterations=dynamics_iters,
                                    learning_rate=learning_rate,
                                    sess=sess)

        mpc_ppo_controller = MPCcontrollerPolicyNet(env=env,
                                                    dyn_model=dyn_model,
                                                    explore=FLAGS.MPC_EXP,
                                                    policy_net=policy_nn,
                                                    self_exp=FLAGS.SELFEXP,
                                                    horizon=mpc_horizon,
                                                    cost_fn=cost_fn,
                                                    num_simulated_paths=num_simulated_paths)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    # if not PPO:
    #     mpc_ppo_controller = mpc_controller

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    # init or load checkpoint with saver
    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(FLAGS.model_path)
    print("checkpoint", checkpoint)

    if checkpoint and checkpoint.model_checkpoint_path and FLAGS.LOAD_MODEL:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old checkpoint")
        if not os.path.exists(FLAGS.model_path):
            os.mkdir(FLAGS.model_path)

    #========================================================
    #
    # Prepare for rollouts
    #
    tstart = time.time()

    states_true = []
    states_predict = []
    rewards_true = []
    rewards_predict = []

    ob = env.reset()
    ob_pre = np.expand_dims(ob, axis=0)

    states_true.append(ob)
    states_predict.append(ob_pre)

    for step in range(100):
        # ac = env.action_space.sample()  # not used, just so we have the datatype
        ac, _ = policy_nn.act(ob, stochastic=True)
        ob, rew, done, _ = env.step(ac)
        ob_pre, r_pre = dyn_model.predict(ob_pre, ac)

        states_true.append(ob)
        rewards_true.append(rew)
        states_predict.append(ob_pre)
        rewards_predict.append(r_pre[0][0])

    states_true = np.asarray(states_true)
    states_predict = np.asarray(states_predict)
    states_predict = np.squeeze(states_predict, axis=1)
    rewards_true = np.asarray(rewards_true)
    rewards_predict = np.asarray(rewards_predict)

    print("states_true", states_true.shape)
    print("states_predict", states_predict.shape)
    print("rewards_true", rewards_true.shape)
    print("rewards_predict", rewards_predict.shape)

    np.savetxt('./data/eval_model/states_true.out', states_true, delimiter=',')
    np.savetxt('./data/eval_model/states_predict.out', states_predict, delimiter=',')
    np.savetxt('./data/eval_model/rewards_true.out', rewards_true, delimiter=',')
    np.savetxt('./data/eval_model/rewards_predict.out', rewards_predict, delimiter=',')
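
# ---------------------------------------------------------------------------
# Usage sketch (an assumption, for illustration): the evaluation variant above
# dumps true vs. predicted states/rewards to ./data/eval_model/*.out. A quick
# way to inspect how far the learned dynamics/reward model drifts over the
# 100-step rollout is to reload those files and plot them:
import numpy as np
import matplotlib.pyplot as plt

states_true = np.loadtxt('./data/eval_model/states_true.out', delimiter=',')
states_predict = np.loadtxt('./data/eval_model/states_predict.out', delimiter=',')
rewards_true = np.loadtxt('./data/eval_model/rewards_true.out', delimiter=',')
rewards_predict = np.loadtxt('./data/eval_model/rewards_predict.out', delimiter=',')

# L2 distance between the true and predicted state at every step after the start.
state_err = np.linalg.norm(states_true[1:] - states_predict[1:], axis=1)

fig, (ax_rew, ax_err) = plt.subplots(2, 1, sharex=True)
ax_rew.plot(rewards_true, label='true reward')
ax_rew.plot(rewards_predict, label='predicted reward')
ax_rew.legend()
ax_err.plot(state_err, label='state prediction error (L2)')
ax_err.set_xlabel('rollout step')
ax_err.legend()
plt.show()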
def train_PG(
        exp_name='',
        env_name='',
        n_iter=100,
        gamma=1.0,
        min_timesteps_per_batch=1000,
        max_path_length=None,
        learning_rate=5e-3,
        reward_to_go=False,
        animate=True,
        logdir=None,
        normalize_advantages=False,
        nn_baseline=False,
        seed=0,
        # network arguments
        n_layers=1,
        size=32,
        # mb mpc arguments
        model_learning_rate=1e-3,
        onpol_iters=10,
        dynamics_iters=260,
        batch_size=512,
        num_paths_random=10,
        num_paths_onpol=10,
        num_simulated_paths=1000,
        env_horizon=1000,
        mpc_horizon=10,
        m_n_layers=2,
        m_size=500,
):
    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    # env = gym.make(env_name)
    env = HalfCheetahEnvNew()
    cost_fn = cheetah_cost_fn
    activation = tf.nn.relu
    output_activation = None

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    # max_path_length = max_path_length or env.spec.max_episode_steps
    max_path_length = max_path_length

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    # Print environment information
    print("-------- env info --------")
    print("Environment name: ", env_name)
    print("Action space is discrete: ", discrete)
    print("Action space dim: ", ac_dim)
    print("Observation space dim: ", ob_dim)
    print("Max_path_length ", max_path_length)

    #========================================================================================#
    # Random data collection
    #========================================================================================#

    random_controller = RandomController(env)
    data_buffer_model = DataBuffer()
    data_buffer_ppo = DataBuffer_general(10000, 4)

    # sample paths
    print("collecting random data ..... ")
    paths = sample(env,
                   random_controller,
                   num_paths=num_paths_random,
                   horizon=env_horizon,
                   render=False,
                   verbose=False)

    # add into buffer
    for path in paths:
        for n in range(len(path['observations'])):
            data_buffer_model.add(path['observations'][n],
                                  path['actions'][n],
                                  path['next_observations'][n])

    print("data buffer size: ", data_buffer_model.size)

    normalization = compute_normalization(data_buffer_model)

    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#
    tf_config = tf.ConfigProto()
    tf_config.allow_soft_placement = True
    tf_config.intra_op_parallelism_threads = 4
    tf_config.inter_op_parallelism_threads = 1
    sess = tf.Session(config=tf_config)

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    policy_nn = policy_network_ppo(sess, ob_dim, ac_dim, discrete, n_layers, size, learning_rate)

    if nn_baseline:
        value_nn = value_network(sess, ob_dim, n_layers, size, learning_rate)

    sess.__enter__()  # equivalent to `with sess:`

    tf.global_variables_initializer().run()

    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)

        if MPC:
            dyn_model.fit(data_buffer_model)
        returns = []
        costs = []

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            # print("data buffer size: ", data_buffer_model.size)
            current_path = {'observations': [],
                            'actions': [],
                            'reward': [],
                            'next_observations': []}

            ob = env.reset()
            obs, acs, mpc_acs, rewards = [], [], [], []
            animate_this_episode = (len(paths) == 0 and (itr % 10 == 0) and animate)
            steps = 0
            return_ = 0

            while True:
                # print("steps ", steps)
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)

                obs.append(ob)

                if MPC:
                    mpc_ac = mpc_controller.get_action(ob)
                else:
                    mpc_ac = random_controller.get_action(ob)

                ac = policy_nn.predict(ob, mpc_ac)
                ac = ac[0]

                if not PG:
                    ac = mpc_ac

                acs.append(ac)
                mpc_acs.append(mpc_ac)

                current_path['observations'].append(ob)

                ob, rew, done, _ = env.step(ac)

                current_path['reward'].append(rew)
                current_path['actions'].append(ac)
                current_path['next_observations'].append(ob)

                return_ += rew
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break

            if MPC:
                # cost & return
                cost = path_cost(cost_fn, current_path)
                costs.append(cost)
                returns.append(return_)
                print("total return: ", return_)
                print("costs: ", cost)

                # add into buffers
                for n in range(len(current_path['observations'])):
                    data_buffer_model.add(current_path['observations'][n],
                                          current_path['actions'][n],
                                          current_path['next_observations'][n])

                for n in range(len(current_path['observations'])):
                    data_buffer_ppo.add(current_path['observations'][n],
                                        current_path['actions'][n],
                                        current_path['reward'][n],
                                        current_path['next_observations'][n])

            path = {"observation": np.array(obs),
                    "reward": np.array(rewards),
                    "action": np.array(acs),
                    "mpc_action": np.array(mpc_acs)}

            paths.append(path)
            timesteps_this_batch += pathlength(path)
            # print("timesteps_this_batch", timesteps_this_batch)
            if timesteps_this_batch > min_timesteps_per_batch:
                break

        total_timesteps += timesteps_this_batch

        print("data_buffer_ppo.size:", data_buffer_ppo.size)

        # Build arrays for observation, action for the policy gradient update by concatenating
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        mpc_ac_na = np.concatenate([path["mpc_action"] for path in paths])

        # Computing Q-values
        if reward_to_go:
            q_n = []
            for path in paths:
                for t in range(len(path["reward"])):
                    t_ = 0
                    q = 0
                    while t_ < len(path["reward"]):
                        if t_ >= t:
                            q += gamma ** (t_ - t) * path["reward"][t_]
                        t_ += 1
                    q_n.append(q)
            q_n = np.asarray(q_n)
        else:
            q_n = []
            for path in paths:
                for t in range(len(path["reward"])):
                    t_ = 0
                    q = 0
                    while t_ < len(path["reward"]):
                        q += gamma ** t_ * path["reward"][t_]
                        t_ += 1
                    q_n.append(q)
            q_n = np.asarray(q_n)

        # Computing Baselines
        if nn_baseline:
            # b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no: ob_no})
            b_n = value_nn.predict(ob_no)
            b_n = normalize(b_n)
            b_n = denormalize(b_n, np.std(q_n), np.mean(q_n))
            adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()

        # Advantage Normalization
        if normalize_advantages:
            adv_n = normalize(adv_n)

        # Optimizing Neural Network Baseline
        if nn_baseline:
            b_n_target = normalize(q_n)
            value_nn.fit(ob_no, b_n_target)
            # sess.run(baseline_update_op, feed_dict={sy_ob_no: ob_no, sy_baseline_target_n: b_n_target})

        # Performing the Policy Update
        # policy_nn.fit(ob_no, ac_na, adv_n)
        policy_nn.fit(ob_no, ac_na, adv_n, mpc_ac_na)
        # sess.run(update_op, feed_dict={sy_ob_no: ob_no, sy_ac_na: ac_na, sy_adv_n: adv_n})

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
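
# ---------------------------------------------------------------------------
# Equivalent sketch (not in the original file): the Q-value loops in train_PG
# are O(T^2) per path. The same quantities can be computed in a single backward
# pass; discounted_reward_to_go matches the reward_to_go branch and
# discounted_total_return matches the full-trajectory branch.
import numpy as np


def discounted_reward_to_go(rewards, gamma):
    """q[t] = sum over t' >= t of gamma**(t' - t) * r[t']."""
    q = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        q[t] = running
    return q


def discounted_total_return(rewards, gamma):
    """Every timestep gets the full discounted return of its path."""
    total = discounted_reward_to_go(rewards, gamma)[0]
    return np.full(len(rewards), total)

# q_n = np.concatenate([discounted_reward_to_go(p["reward"], gamma) for p in paths])   # reward_to_go=True
# q_n = np.concatenate([discounted_total_return(p["reward"], gamma) for p in paths])   # reward_to_go=False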
def train(
        env,
        cost_fn,
        logdir=None,
        render=False,
        learning_rate=1e-3,
        onpol_iters=10,
        dynamics_iters=60,
        batch_size=512,
        num_paths_random=10,
        num_paths_onpol=10,
        num_simulated_paths=10000,
        env_horizon=1000,
        mpc_horizon=15,
        n_layers=2,
        size=500,
        activation=tf.nn.relu,
        output_activation=None,
        clip_param=0.2,
        entcoeff=0.0,
        gamma=0.99,
        lam=0.95,
        optim_epochs=10,
        optim_batchsize=64,
        schedule='linear',
        optim_stepsize=3e-4,
        timesteps_per_actorbatch=1000,
        BEHAVIORAL_CLONING=True,
        PPO=True,
):
    start = time.time()

    logz.configure_output_dir(logdir)

    print("-------- env info --------")
    print("observation_space: ", env.observation_space.shape)
    print("action_space: ", env.action_space.shape)
    print("BEHAVIORAL_CLONING: ", BEHAVIORAL_CLONING)
    print("PPO: ", PPO)
    print(" ")

    random_controller = RandomController(env)
    model_data_buffer = DataBuffer()
    ppo_data_buffer = DataBuffer_general(BC_BUFFER_SIZE, 6)
    bc_data_buffer = DataBuffer_general(BC_BUFFER_SIZE, 2)

    # sample paths
    print("collecting random data ..... ")
    paths = sample(env,
                   random_controller,
                   num_paths=num_paths_random,
                   horizon=env_horizon,
                   render=False,
                   verbose=False)

    # add into buffer
    for path in paths:
        for n in range(len(path['observations'])):
            model_data_buffer.add(path['observations'][n],
                                  path['actions'][n],
                                  path['next_observations'][n])

    print("model data buffer size: ", model_data_buffer.size)

    normalization = compute_normalization(model_data_buffer)

    #========================================================
    #
    # Build dynamics model and MPC controllers and Behavioral cloning network.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    policy_nn = MlpPolicy_bc(sess=sess,
                             env=env,
                             hid_size=64,
                             num_hid_layers=2,
                             clip_param=clip_param,
                             entcoeff=entcoeff)

    bc_net = BCnetwork(sess, env, BATCH_SIZE_BC, learning_rate)

    mpc_controller_bc_ppo = MPCcontroller_BC_PPO(env=env,
                                                 dyn_model=dyn_model,
                                                 bc_ppo_network=policy_nn,
                                                 horizon=mpc_horizon,
                                                 cost_fn=cost_fn,
                                                 num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    # init or load checkpoint with saver
    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR)

    if checkpoint and checkpoint.model_checkpoint_path and LOAD_MODEL:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old checkpoint")
        if not os.path.exists(CHECKPOINT_DIR):
            os.mkdir(CHECKPOINT_DIR)

    #========================================================
    #
    # Prepare for rollouts
    #
    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    max_timesteps = num_paths_onpol * env_horizon

    for itr in range(onpol_iters):
        print("onpol_iters: ", itr)

        dyn_model.fit(model_data_buffer)

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)

        # saver.save(sess, CHECKPOINT_DIR)

        behavioral_cloning_eval(sess, env, policy_nn, env_horizon)

        ppo_data_buffer.clear()

        seg = traj_segment_generator(policy_nn, mpc_controller, mpc_controller_bc_ppo,
                                     bc_data_buffer, env, env_horizon)
        add_vtarg_and_adv(seg, gamma, lam)

        ob, ac, rew, nxt_ob, atarg, tdlamret = \
            seg["ob"], seg["ac"], seg["rew"], seg["nxt_ob"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate

        for n in range(len(ob)):
            ppo_data_buffer.add((ob[n], ac[n], rew[n], nxt_ob[n], atarg[n], tdlamret[n]))
            bc_data_buffer.add((ob[n], ac[n]))
            model_data_buffer.add(ob[n], ac[n], nxt_ob[n])

        print("ppo_data_buffer size", ppo_data_buffer.size)
        print("bc_data_buffer size", bc_data_buffer.size)
        print("model data buffer size: ", model_data_buffer.size)

        # optim_batchsize = optim_batchsize or ob.shape[0]
        # behavioral_cloning(sess, env, bc_net, mpc_controller, env_horizon, bc_data_buffer, Training_epoch=1000)

        if hasattr(policy_nn, "ob_rms"):
            policy_nn.ob_rms.update(ob)  # update running mean/std for policy
        policy_nn.assign_old_eq_new()  # set old parameter values to new parameter values

        for op_ep in range(optim_epochs):
            # losses = []  # list of tuples, each of which gives the loss for a minibatch
            # for i in range(int(timesteps_per_actorbatch/optim_batchsize)):
            if PPO:
                sample_ob_no, sample_ac_na, sample_rew, sample_nxt_ob_no, sample_adv_n, sample_b_n_target = \
                    ppo_data_buffer.sample(optim_batchsize)
                newlosses = policy_nn.lossandupdate_ppo(sample_ob_no, sample_ac_na,
                                                        sample_adv_n, sample_b_n_target,
                                                        cur_lrmult, optim_stepsize * cur_lrmult)
                # losses.append(newlosses)

            if BEHAVIORAL_CLONING:
                sample_ob_no, sample_ac_na = bc_data_buffer.sample(optim_batchsize)
                policy_nn.update_bc(sample_ob_no, sample_ac_na, optim_stepsize * cur_lrmult)

            if op_ep % 100 == 0:
                print('epoch: ', op_ep)
                behavioral_cloning_eval(sess, env, policy_nn, env_horizon)

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        ep_lengths = seg["ep_lens"]
        returns = seg["ep_rets"]

        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", iters_so_far)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        # logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", timesteps_so_far)
        logz.dump_tabular()
        logz.pickle_tf_vars()
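
# ---------------------------------------------------------------------------
# Hypothetical entry point (an assumption, for illustration only): the BC + PPO
# + MPC variant above expects an environment and a hand-written cost function.
# HalfCheetahEnvNew and cheetah_cost_fn are the ones referenced by train_PG
# earlier in this dump; the logdir and iteration count below are placeholders.
if __name__ == "__main__":
    env = HalfCheetahEnvNew()
    train(env=env,
          cost_fn=cheetah_cost_fn,
          logdir='./data/mpc_bc_ppo',  # placeholder output directory
          onpol_iters=100,
          env_horizon=1000,
          mpc_horizon=15,
          num_simulated_paths=1000,
          BEHAVIORAL_CLONING=True,
          PPO=True)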