def get_test_batch(self, num_tasks, resample=False, task=None, controller='Rand',
                   task_range=(0, 7), task_fun=np.random.randint):
    # Pick the controller used to collect test rollouts.
    if controller == 'Rand':
        self.controller = RandomController(self.env)
    elif controller == "MPC":
        self.controller = MPCcontroller(self.env)

    if resample:  # random sample of tasks (goals)
        if task is None:
            learner_env_goals = sample_goals(num_tasks, task_range, task_fun)
        else:
            learner_env_goals = task

        for i in range(num_tasks):
            task = learner_env_goals[i]
            paths = sample(self.env, task, self.controller,
                           num_paths=self.num_paths_random, horizon=self.env_horizon,
                           ignore_done=True, K=self.K, M=self.M)  # 10
            data_x, data_y = self._data_process(paths)
            data_x = data_x[np.newaxis, :]
            data_y = data_y[np.newaxis, :]
            if i == 0:
                x = data_x
                y = data_y
            else:
                x = np.concatenate([x, data_x], axis=0)
                y = np.concatenate([y, data_y], axis=0)

    # Slice each task's trajectory into overlapping windows of length K + M.
    data_x, data_y = [], []
    for t in range(num_tasks):
        for h in range(self.env_horizon):
            data_x.append(x[t, h:(h + self.K + self.M), :])
            data_y.append(y[t, h:(h + self.K + self.M), :])
    data_x = np.array(data_x)
    data_y = np.array(data_y)

    # dataset = tf.data.Dataset.from_tensor_slices((data_x, data_y)).shuffle(
    #     buffer_size=self.env_horizon * self.num_tasks).batch(
    #     self.env_horizon).repeat()
    # # create the iterator
    # iter = dataset.make_one_shot_iterator()
    # iterator = iter.get_next()
    return data_x, data_y
def get_dataset(self, resample=False, task=None, controller='Rand',
                task_range=(0, 7), task_fun=np.random.randint):
    if controller == 'Rand':
        self.controller = RandomController(self.env)
    elif controller == "MPC":
        self.controller = MPCcontroller(self.env)

    if resample:  # random sample of tasks (goals)
        if task is None:
            learner_env_goals = sample_goals(self.num_tasks, task_range, task_fun)
        else:
            learner_env_goals = task

        for i in range(self.num_tasks):
            task = learner_env_goals[i]
            paths = sample(self.env, task, self.controller,
                           num_paths=self.num_paths_random, horizon=self.env_horizon,
                           ignore_done=True, K=self.K, M=self.M)  # 10
            data_x, data_y = self._data_process(paths)
            data_x = data_x[np.newaxis, :]
            data_y = data_y[np.newaxis, :]
            if i == 0:
                self.x = data_x
                self.y = data_y
            else:
                self.x = np.concatenate([self.x, data_x], axis=0)
                self.y = np.concatenate([self.y, data_y], axis=0)

    # end = time.time()
    # runtime1 = end - start
    # print('time ', runtime1)
    print('env_horizon:', self.env_horizon)
    print('len of x:', len(self.x))
    return len(self.x)
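# The goal-sampling helper used above is not defined in this excerpt. Below is a
# minimal sketch consistent with the call sites; the name, signature, and the
# (low, high, size) sampling convention are assumptions, not the repository's code.
import numpy as np

def sample_goals(num_tasks, task_range=(0, 7), task_fun=np.random.randint):
    """Hypothetical helper: draw one goal per task from task_range.

    task_fun is assumed to follow the numpy convention (low, high, size),
    e.g. np.random.randint or np.random.uniform.
    """
    low, high = task_range
    return task_fun(low, high, num_tasks)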
def train(env, cost_fn, logdir=None, render=False, learning_rate=1e-3,
          onpol_iters=10, dynamics_iters=60, batch_size=512,
          num_paths_random=10, num_paths_onpol=10, num_simulated_paths=10000,
          env_horizon=1000, mpc_horizon=15, n_layers=2, size=500,
          activation=tf.nn.relu, output_activation=None,
          clip_param=0.2, entcoeff=0.0, gamma=0.99, lam=0.95,
          optim_epochs=10, optim_batchsize=64, schedule='linear',
          bc_lr=1e-3, ppo_lr=3e-4, timesteps_per_actorbatch=1000,
          MPC=True, BEHAVIORAL_CLONING=True, PPO=True,
          ):
    start = time.time()

    logz.configure_output_dir(logdir)
    merged_summary, summary_writer, ppo_return_op, mpc_return_op, model_loss_op, \
        reward_loss_op, ppo_std_op, mpc_std_op = build_summary_ops(logdir, env)

    print("-------- env info --------")
    print("Environment: ", FLAGS.env_name)
    print("observation_space: ", env.observation_space.shape)
    print("action_space: ", env.action_space.shape)
    print("action_space low: ", env.action_space.low)
    print("action_space high: ", env.action_space.high)
    print("BEHAVIORAL_CLONING: ", BEHAVIORAL_CLONING)
    print("PPO: ", PPO)
    print("MPC-AUG: ", MPC)
    print(" ")

    random_controller = RandomController(env)

    # Create buffers
    model_data_buffer = DataBufferGeneral(FLAGS.MODELBUFFER_SIZE, 5)
    ppo_data_buffer = DataBufferGeneral(10000, 4)
    bc_data_buffer = DataBufferGeneral(2000, 2)

    # Random sample path
    print("collecting random data ..... ")
    paths = sample(env, random_controller, num_paths=num_paths_random,
                   horizon=env_horizon, render=False, verbose=False)

    # add into buffer
    for path in paths:
        for n in range(len(path['observations'])):
            model_data_buffer.add([
                path['observations'][n], path['actions'][n], path['rewards'][n],
                path['next_observations'][n],
                path['next_observations'][n] - path['observations'][n]
            ])

    print("model data buffer size: ", model_data_buffer.size)

    normalization = compute_normalization(model_data_buffer)

    #========================================================
    #
    # Build dynamics model, MPC controllers and behavioral cloning network.
    #
    # tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    sess = tf.Session(config=tf_config)

    policy_nn = MlpPolicy(sess=sess, env=env, hid_size=128, num_hid_layers=2,
                          clip_param=clip_param, entcoeff=entcoeff)

    if FLAGS.LEARN_REWARD:
        print("Learn reward function")
        dyn_model = NNDynamicsRewardModel(env=env, normalization=normalization,
                                          batch_size=batch_size, iterations=dynamics_iters,
                                          learning_rate=learning_rate, sess=sess)
        mpc_ppo_controller = MPCcontrollerPolicyNetReward(
            env=env, dyn_model=dyn_model, explore=FLAGS.MPC_EXP,
            policy_net=policy_nn, self_exp=FLAGS.SELFEXP,
            horizon=mpc_horizon, num_simulated_paths=num_simulated_paths)
    else:
        print("Use predefined cost function")
        dyn_model = NNDynamicsModel(env=env, n_layers=n_layers, size=size,
                                    activation=activation, output_activation=output_activation,
                                    normalization=normalization, batch_size=batch_size,
                                    iterations=dynamics_iters, learning_rate=learning_rate,
                                    sess=sess)
        mpc_ppo_controller = MPCcontrollerPolicyNet(
            env=env, dyn_model=dyn_model, explore=FLAGS.MPC_EXP,
            policy_net=policy_nn, self_exp=FLAGS.SELFEXP,
            horizon=mpc_horizon, cost_fn=cost_fn,
            num_simulated_paths=num_simulated_paths)

    mpc_controller = MPCcontroller(env=env, dyn_model=dyn_model, horizon=mpc_horizon,
                                   cost_fn=cost_fn, num_simulated_paths=num_simulated_paths)

    # if not PPO:
    #     mpc_ppo_controller = mpc_controller

    #========================================================
    #
    # Tensorflow session building.
    #
    # sess.__enter__()
    tf.global_variables_initializer().run()

    # init or load checkpoint with saver
    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(logdir)

    if checkpoint and checkpoint.model_checkpoint_path and FLAGS.LOAD_MODEL:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old checkpoint")
        if not os.path.exists(logdir):
            os.mkdir(logdir)

    #========================================================
    #
    # Prepare for rollouts
    #
    episodes_so_far = 0
    timesteps_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    max_timesteps = num_paths_onpol * env_horizon

    bc = False
    ppo_mpc = False
    mpc_returns = 0
    model_loss = 0

    for itr in range(onpol_iters):
        print(" ")
        print("onpol_iters: ", itr)

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)

        print("bc learning_rate: ", bc_lr)
        print("ppo learning_rate: ", ppo_lr)

        ################## fit mpc model
        if MPC:
            model_loss, reward_loss = dyn_model.fit(model_data_buffer)

        ################## ppo seg data
        ppo_data_buffer.clear()

        # ppo_seg = traj_segment_generator_ppo(policy_nn, env, env_horizon)
        ppo_mpc = False
        mpc = False
        ppo_seg = traj_segment_generator(policy_nn, mpc_controller, mpc_ppo_controller,
                                         bc_data_buffer, env, mpc, ppo_mpc, env_horizon)
        add_vtarg_and_adv(ppo_seg, gamma, lam)

        ob, ac, rew, nxt_ob, atarg, tdlamret = \
            ppo_seg["ob"], ppo_seg["ac"], ppo_seg["rew"], ppo_seg["nxt_ob"], ppo_seg["adv"], ppo_seg["tdlamret"]
        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate

        # add into buffer
        for n in range(len(ob)):
            ppo_data_buffer.add([ob[n], ac[n], atarg[n], tdlamret[n]])
            model_data_buffer.add([ob[n], ac[n], rew[n], nxt_ob[n], nxt_ob[n] - ob[n]])

        ppo_std = np.std(ac, axis=0)
        print("ppo_std: ", ppo_std)

        ################## mpc augmented seg data
        if MPC:
            print("MPC AUG PPO")
            ppo_mpc = True
            mpc = True
            mpc_seg = traj_segment_generator(policy_nn, mpc_controller, mpc_ppo_controller,
                                             bc_data_buffer, env, mpc, ppo_mpc, env_horizon)
            add_vtarg_and_adv(mpc_seg, gamma, lam)

            ob, ac, mpcac, rew, nxt_ob, atarg, tdlamret = \
                mpc_seg["ob"], mpc_seg["ac"], mpc_seg["mpcac"], mpc_seg["rew"], \
                mpc_seg["nxt_ob"], mpc_seg["adv"], mpc_seg["tdlamret"]
            atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate

            mpc_returns = mpc_seg["ep_rets"]
            mpc_std = np.std(mpcac)

        if not MPC:
            mpc_std = 0

        ################## mpc random seg data
        if FLAGS.mpc_rand:
            print("MPC Random base policy")
            ppo_mpc = False
            mpc = True
            mpc_random_seg = traj_segment_generator(policy_nn, mpc_controller,
                                                    mpc_ppo_controller, bc_data_buffer,
                                                    env, mpc, ppo_mpc, env_horizon)
            add_vtarg_and_adv(mpc_random_seg, gamma, lam)

            ob, ac, mpcac, rew, nxt_ob, atarg, tdlamret = \
                mpc_random_seg["ob"], mpc_random_seg["ac"], mpc_random_seg["mpcac"], \
                mpc_random_seg["rew"], mpc_random_seg["nxt_ob"], mpc_random_seg["adv"], \
                mpc_random_seg["tdlamret"]
            atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate

            mpc_rand_returns = mpc_random_seg["ep_rets"]

        ################# PPO deterministic evaluation
        ppo_determinisitc_return = policy_net_eval(sess, env, policy_nn, env_horizon,
                                                   stochastic=False)

        ################## optimization
        print("ppo_data_buffer size", ppo_data_buffer.size)
        print("bc_data_buffer size", bc_data_buffer.size)
        print("model data buffer size: ", model_data_buffer.size)

        # optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(policy_nn, "ob_rms"):
            policy_nn.ob_rms.update(ob)  # update running mean/std for policy
        policy_nn.assign_old_eq_new()  # set old parameter values to new parameter values

        for op_ep in range(optim_epochs):
            # losses = []  # list of tuples, each of which gives the loss for a minibatch
            # for i in range(int(timesteps_per_actorbatch/optim_batchsize)):
            if PPO:
                sample_ob_no, sample_ac_na, sample_adv_n, sample_b_n_target = \
                    ppo_data_buffer.sample(optim_batchsize)
                newlosses = policy_nn.lossandupdate_ppo(
                    sample_ob_no, sample_ac_na, sample_adv_n, sample_b_n_target,
                    cur_lrmult, ppo_lr * cur_lrmult)
                # losses.append(newlosses)

            if BEHAVIORAL_CLONING and bc:
                sample_ob_no, sample_ac_na = bc_data_buffer.sample(optim_batchsize)
                # print("sample_ob_no", sample_ob_no.shape)
                # print("sample_ac_na", sample_ac_na.shape)
                policy_nn.update_bc(sample_ob_no, sample_ac_na, bc_lr * cur_lrmult)

            if op_ep % 100 == 0 and BEHAVIORAL_CLONING and bc:
                print('epoch: ', op_ep)
                policy_net_eval(sess, env, policy_nn, env_horizon)

        ################## print and save data
        seg = ppo_seg
        ep_lengths = seg["ep_lens"]
        returns = seg["ep_rets"]

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)

        # log ppo
        logz.log_tabular("TimeSoFar", time.time() - start)
        logz.log_tabular("TimeEp", time.time() - tstart)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsSoFar", timesteps_so_far)
        logz.log_tabular("Condition", "PPO")
        logz.dump_tabular()

        # log ppo deterministic
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", ppo_determinisitc_return)
        logz.log_tabular("Condition", "PPO_DETERMINISTIC")
        logz.dump_tabular()

        # log mpc
        if MPC:
            logz.log_tabular("TimeSoFar", time.time() - start)
            logz.log_tabular("TimeEp", time.time() - tstart)
            logz.log_tabular("Iteration", itr)
            logz.log_tabular("AverageReturn", np.mean(mpc_returns))
            logz.log_tabular("StdReturn", np.std(mpc_returns))
            logz.log_tabular("MaxReturn", np.max(mpc_returns))
            logz.log_tabular("MinReturn", np.min(mpc_returns))
            logz.log_tabular("EpLenMean", np.mean(ep_lengths))
            logz.log_tabular("EpLenStd", np.std(ep_lengths))
            logz.log_tabular("TimestepsSoFar", timesteps_so_far)
            logz.log_tabular("Condition", "MPC_PPO")
            logz.dump_tabular()

        if FLAGS.mpc_rand:
            logz.log_tabular("TimeSoFar", time.time() - start)
            logz.log_tabular("TimeEp", time.time() - tstart)
            logz.log_tabular("Iteration", itr)
            logz.log_tabular("AverageReturn", np.mean(mpc_rand_returns))
            logz.log_tabular("StdReturn", np.std(mpc_rand_returns))
            logz.log_tabular("MaxReturn", np.max(mpc_rand_returns))
            logz.log_tabular("MinReturn", np.min(mpc_rand_returns))
            logz.log_tabular("EpLenMean", np.mean(ep_lengths))
            logz.log_tabular("EpLenStd", np.std(ep_lengths))
            logz.log_tabular("TimestepsSoFar", timesteps_so_far)
            logz.log_tabular("Condition", "MPC_RAND")
            logz.dump_tabular()

        # logz.pickle_tf_vars()

        tstart = time.time()

        ################### TF Summaries
        summary_str = sess.run(merged_summary,
                               feed_dict={
                                   ppo_return_op: np.mean(returns),
                                   mpc_return_op: np.mean(mpc_returns),
                                   model_loss_op: model_loss,
                                   ppo_std_op: ppo_std,
                                   reward_loss_op: reward_loss,
                                   mpc_std_op: mpc_std,
                               })
        summary_writer.add_summary(summary_str, itr)
        summary_writer.flush()

        ################ TF SAVE
        if itr % FLAGS.SAVE_ITER == 0 and itr != 0:
            save_path = saver.save(sess, logdir + "/model.ckpt")
            print("Model saved in path: %s" % save_path)
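# add_vtarg_and_adv is called throughout these loops but not defined in this excerpt.
# The sketch below follows the standard GAE(lambda) computation from the baselines-style
# PPO code this loop mirrors; the segment field names ("new", "vpred", "nextvpred") are
# assumptions about the segment dict, not confirmed by this file.
import numpy as np

def add_vtarg_and_adv(seg, gamma, lam):
    """Append GAE(lambda) advantages ("adv") and value targets ("tdlamret") to a segment dict."""
    new = np.append(seg["new"], 0)  # episode-start flags, padded for the bootstrap step
    vpred = np.append(seg["vpred"], seg["nextvpred"])
    T = len(seg["rew"])
    seg["adv"] = gaelam = np.empty(T, "float32")
    rew = seg["rew"]
    lastgaelam = 0
    for t in reversed(range(T)):
        nonterminal = 1 - new[t + 1]
        delta = rew[t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
        gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    seg["tdlamret"] = seg["adv"] + seg["vpred"]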
def main(args):
    tf.set_random_seed(args.seed)
    np.random.seed(args.seed)

    env_name = args.env_name  # HalfCheetah-v2 My3LineDirect-v1
    print(env_name)

    if args.env_name == 'HalfCheetahEnvDisableEnv-v0':
        cost_fn = cheetah_cost_fn
        sample_task_fun = np.random.randint
    elif args.env_name == 'HalfCheetahVaryingEnv-v0':
        cost_fn = cheetah_cost_fn
        sample_task_fun = np.random.uniform
    else:
        print('Error: unsupported environment!')

    env = gym.make(env_name)
    dim_input = env.observation_space.shape[0] + env.action_space.shape[0]
    dim_output = env.observation_space.shape[0]

    logdir = configure_log_dir(logname=env_name, txt=args.note)

    # save args parameters
    with open(logdir + '/info.txt', 'wt') as f:
        print('Hello World!\n', file=f)
        print(args, file=f)

    mpc_horizon = args.mpc_horizon
    num_simulated_paths = args.simulated_paths  # 10000

    dyn_model = Dynamics(args.env_name, args.NumOfExp, args.model_type, args.loss_type,
                         dim_input, dim_output,
                         beta=args.beta,
                         max_epochs=args.max_epochs,
                         is_train=args.is_train,
                         norm=args.norm,
                         task_Note=args.note,
                         restore_checkpoint=args.restore_checkpoint,
                         restore_dir=args.restore_dir,
                         logdir=logdir)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths,
                                   )

    logger = Logger(logdir, csvname='log')

    num_itr = args.num_itr
    experiences, costs = [], []

    print('MPC is beginning...')
    for itr in range(num_itr):
        reward, model_loss_mean = rollout(env, mpc_controller,
                                          task_goal=args.task_goal,
                                          dyn_model=dyn_model,
                                          experiences=experiences,
                                          NumOfExp=args.NumOfExp,
                                          horizon=args.horizon,
                                          cost_fn=cheetah_cost_fn,
                                          render=False,
                                          verbose=False,
                                          save_video=False,
                                          ignore_done=True,
                                          )
        # print(time.asctime(time.localtime(time.time())), ' itr :', itr, 'Average reward :', cost)
        log.infov("Itr {}/{} Accumulated Reward: {:.4f} Model loss mean: {:.4f}".format(
            itr, num_itr, reward, model_loss_mean))
        logger.log({
            'itr': itr,
            'Accumulated Reward': reward,
            'Model loss mean': model_loss_mean,
        })

    print('MPC is over....')
    logger.write(display=False)
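# The MPCcontroller used throughout these scripts is a random-shooting planner: it samples
# many candidate action sequences, rolls each through the learned dynamics model, and executes
# the first action of the cheapest sequence. The class below is only an illustration of that
# idea under assumed interfaces (a batched dyn_model.predict and a batched cost_fn(s, a, s')),
# not the repository's actual implementation.
import numpy as np

class RandomShootingMPC:
    """Illustrative random-shooting MPC controller."""

    def __init__(self, env, dyn_model, cost_fn, horizon=15, num_simulated_paths=10000):
        self.env = env
        self.dyn_model = dyn_model
        self.cost_fn = cost_fn
        self.horizon = horizon
        self.num_simulated_paths = num_simulated_paths

    def get_action(self, state):
        low, high = self.env.action_space.low, self.env.action_space.high
        ac_dim = self.env.action_space.shape[0]
        # Sample candidate action sequences uniformly at random.
        actions = np.random.uniform(low, high,
                                    (self.horizon, self.num_simulated_paths, ac_dim))
        states = np.tile(state, (self.num_simulated_paths, 1))
        costs = np.zeros(self.num_simulated_paths)
        for t in range(self.horizon):
            next_states = self.dyn_model.predict(states, actions[t])  # assumed batched predict
            costs += self.cost_fn(states, actions[t], next_states)    # assumed batched cost
            states = next_states
        # Return the first action of the lowest-cost imagined rollout.
        return actions[0, np.argmin(costs)]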
def train(env, cost_fn, logdir=None, render=False, learning_rate=1e-3,
          onpol_iters=10, dynamics_iters=60, batch_size=512,
          num_paths_random=10, num_paths_onpol=10, num_simulated_paths=10000,
          env_horizon=1000, mpc_horizon=15, n_layers=2, size=500,
          activation=tf.nn.relu, output_activation=None,
          clip_param=0.2, entcoeff=0.0, gamma=0.99, lam=0.95,
          optim_epochs=10, optim_batchsize=64, schedule='linear',
          bc_lr=1e-3, ppo_lr=3e-4, timesteps_per_actorbatch=1000,
          MPC=True, BEHAVIORAL_CLONING=True, PPO=True,
          ):
    start = time.time()

    logz.configure_output_dir(logdir)

    print("-------- env info --------")
    print("observation_space: ", env.observation_space.shape)
    print("action_space: ", env.action_space.shape)
    print("BEHAVIORAL_CLONING: ", BEHAVIORAL_CLONING)
    print("PPO: ", PPO)
    print("MPC-AUG: ", MPC)
    print(" ")

    # initialize buffers
    model_data_buffer = DataBufferGeneral(1000000, 5)
    ppo_data_buffer = DataBufferGeneral(10000, 4)
    bc_data_buffer = DataBufferGeneral(BC_BUFFER_SIZE, 2)

    # random sample path
    print("collecting random data ..... ")
    random_controller = RandomController(env)
    paths = sample(env, random_controller, num_paths=num_paths_random,
                   horizon=env_horizon, render=False, verbose=False)

    # add into buffer
    for path in paths:
        for n in range(len(path['observations'])):
            model_data_buffer.add([path['observations'][n],
                                   path['actions'][n],
                                   path['rewards'][n],
                                   path['next_observations'][n],
                                   path['next_observations'][n] - path['observations'][n]])

    print("model data buffer size: ", model_data_buffer.size)

    normalization = compute_normalization(model_data_buffer)

    #========================================================
    #
    # Build dynamics model, MPC controllers and behavioral cloning network.
    #
    # tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    sess = tf.Session(config=tf_config)

    dyn_model = NNDynamicsRewardModel(env=env, normalization=normalization,
                                      batch_size=batch_size, iterations=dynamics_iters,
                                      learning_rate=learning_rate, sess=sess)

    mpc_controller = MPCcontroller(env=env, dyn_model=dyn_model, horizon=mpc_horizon,
                                   cost_fn=cost_fn, num_simulated_paths=num_simulated_paths)

    policy_nn = MlpPolicy(sess=sess, env=env, hid_size=256, num_hid_layers=2,
                          clip_param=clip_param, entcoeff=entcoeff)

    mpc_ppo_controller = MPCcontrollerPolicyNetReward(env=env, dyn_model=dyn_model,
                                                      policy_net=policy_nn, self_exp=False,
                                                      horizon=mpc_horizon,
                                                      num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    # sess.__enter__()
    tf.global_variables_initializer().run()

    # init or load checkpoint with saver
    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR)

    if checkpoint and checkpoint.model_checkpoint_path and LOAD_MODEL:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old checkpoint")
        if not os.path.exists(CHECKPOINT_DIR):
            os.mkdir(CHECKPOINT_DIR)

    #========================================================
    #
    # Prepare for rollouts
    #
    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
    max_timesteps = num_paths_onpol * env_horizon

    bc = False
    ppo_mpc = False
    mpc_returns = 0

    for itr in range(onpol_iters):
        print(" ")
        print("onpol_iters: ", itr)

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)

        print("bc learning_rate: ", bc_lr)
        print("ppo learning_rate: ", ppo_lr)

        ################## fit mpc model
        if MPC:
            dyn_model.fit(model_data_buffer)

        ################## ppo seg data
        if PPO:
            ppo_data_buffer.clear()

            # ppo_seg = traj_segment_generator_ppo(policy_nn, env, env_horizon)
            mpc = False
            ppo_seg = traj_segment_generator(policy_nn, mpc_controller, mpc_ppo_controller,
                                             bc_data_buffer, env, mpc, ppo_mpc, env_horizon)
            add_vtarg_and_adv(ppo_seg, gamma, lam)

            ob, ac, rew, nxt_ob, atarg, tdlamret = \
                ppo_seg["ob"], ppo_seg["ac"], ppo_seg["rew"], ppo_seg["nxt_ob"], ppo_seg["adv"], ppo_seg["tdlamret"]
            atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate

            # add into buffer
            for n in range(len(ob)):
                ppo_data_buffer.add([ob[n], ac[n], atarg[n], tdlamret[n]])
                if MPC:
                    model_data_buffer.add([ob[n], ac[n], rew[n], nxt_ob[n], nxt_ob[n] - ob[n]])

        ################## mpc augmented seg data
        if itr % MPC_AUG_GAP == 0 and MPC:
            print("MPC AUG PPO")
            ppo_mpc = True
            mpc = True
            mpc_seg = traj_segment_generator(policy_nn, mpc_controller, mpc_ppo_controller,
                                             bc_data_buffer, env, mpc, ppo_mpc, env_horizon)
            add_vtarg_and_adv(mpc_seg, gamma, lam)

            ob, ac, mpcac, rew, nxt_ob, atarg, tdlamret = \
                mpc_seg["ob"], mpc_seg["ac"], mpc_seg["mpcac"], mpc_seg["rew"], \
                mpc_seg["nxt_ob"], mpc_seg["adv"], mpc_seg["tdlamret"]
            atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate

            # add into buffer
            for n in range(len(ob)):
                # if PPO:
                #     ppo_data_buffer.add([ob[n], ac[n], atarg[n], tdlamret[n]])
                if BEHAVIORAL_CLONING and bc:
                    bc_data_buffer.add([ob[n], mpcac[n]])
                if MPC:
                    model_data_buffer.add([ob[n], mpcac[n], rew[n], nxt_ob[n], nxt_ob[n] - ob[n]])

            mpc_returns = mpc_seg["ep_rets"]

        seg = ppo_seg

        # check if seg is good
        ep_lengths = seg["ep_lens"]
        returns = seg["ep_rets"]

        # saver.save(sess, CHECKPOINT_DIR)

        if BEHAVIORAL_CLONING:
            if np.mean(returns) > 100:
                bc = True
            else:
                bc = False
            print("BEHAVIORAL_CLONING: ", bc)

            bc_return = behavioral_cloning_eval(sess, env, policy_nn, env_horizon)
            if bc_return > 100:
                ppo_mpc = True
            else:
                ppo_mpc = False

        ################## optimization
        print("ppo_data_buffer size", ppo_data_buffer.size)
        print("bc_data_buffer size", bc_data_buffer.size)
        print("model data buffer size: ", model_data_buffer.size)

        # optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(policy_nn, "ob_rms"):
            policy_nn.ob_rms.update(ob)  # update running mean/std for policy
        policy_nn.assign_old_eq_new()  # set old parameter values to new parameter values

        for op_ep in range(optim_epochs):
            # losses = []  # list of tuples, each of which gives the loss for a minibatch
            # for i in range(int(timesteps_per_actorbatch/optim_batchsize)):
            if PPO:
                sample_ob_no, sample_ac_na, sample_adv_n, sample_b_n_target = \
                    ppo_data_buffer.sample(optim_batchsize)
                newlosses = policy_nn.lossandupdate_ppo(sample_ob_no, sample_ac_na,
                                                        sample_adv_n, sample_b_n_target,
                                                        cur_lrmult, ppo_lr * cur_lrmult)
                # losses.append(newlosses)

            if BEHAVIORAL_CLONING and bc:
                sample_ob_no, sample_ac_na = bc_data_buffer.sample(optim_batchsize)
                # print("sample_ob_no", sample_ob_no.shape)
                # print("sample_ac_na", sample_ac_na.shape)
                policy_nn.update_bc(sample_ob_no, sample_ac_na, bc_lr * cur_lrmult)

            if op_ep % 100 == 0 and BEHAVIORAL_CLONING and bc:
                print('epoch: ', op_ep)
                behavioral_cloning_eval(sess, env, policy_nn, env_horizon)

        ################## print and save data
        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        # if np.mean(returns) > 1000:
        #     filename = "seg_data.pkl"
        #     pickle.dump(seg, open(filename, 'wb'))
        #     print("saved", filename)

        logz.log_tabular("TimeSoFar", time.time() - start)
        logz.log_tabular("TimeEp", time.time() - tstart)
        logz.log_tabular("Iteration", iters_so_far)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("MpcReturn", np.mean(mpc_returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        # logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", timesteps_so_far)
        logz.dump_tabular()
        logz.pickle_tf_vars()

        tstart = time.time()
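# compute_normalization is called on the replay buffer above but is not defined in this
# excerpt. The sketch below shows what it conventionally returns for a model-based RL
# pipeline (per-dimension mean/std of observations, actions, and state deltas); the buffer
# accessor get_all() and the return layout are assumptions.
import numpy as np

def compute_normalization(data_buffer, eps=1e-8):
    """Hypothetical sketch: statistics for normalizing dynamics-model inputs
    and denormalizing its delta predictions."""
    obs, acs, _, _, deltas = data_buffer.get_all()  # assumed accessor
    return {
        'observations': (np.mean(obs, axis=0), np.std(obs, axis=0) + eps),
        'actions': (np.mean(acs, axis=0), np.std(acs, axis=0) + eps),
        'deltas': (np.mean(deltas, axis=0), np.std(deltas, axis=0) + eps),
    }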
def train(env, cost_fn, exp_name='test', logdir=None, render=False, learning_rate=1e-3,
          onpol_iters=10, dynamics_iters=60, batch_size=512,
          num_paths_random=10, num_paths_onpol=10, num_simulated_paths=10000,
          env_horizon=1000, mpc_horizon=15, n_layers=2, size=500,
          activation=tf.nn.relu, output_activation=None):
    """
    Args:

    onpol_iters: Number of iterations of onpolicy aggregation for the loop to run.
    dynamics_iters: Number of iterations of training for the dynamics model
        which happen per iteration of the aggregation loop.
    batch_size: Batch size for dynamics training.
    num_paths_random: Number of paths/trajectories/rollouts generated by a random
        agent. We use these to train our initial dynamics model.
    num_paths_onpol: Number of paths to collect at each iteration of aggregation,
        using the MPC policy.
    num_simulated_paths: How many fictitious rollouts the MPC policy should generate
        each time it is asked for an action.
    env_horizon: Number of timesteps in each path.
    mpc_horizon: The MPC policy generates actions by imagining fictitious rollouts
        and picking the first action of the best fictitious rollout. This argument
        is how many timesteps should be in each fictitious rollout.
    n_layers/size/activations: Neural network architecture arguments.
    """
    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train)[0]
    locals_ = locals()
    locals_['cost_fn'] = 'cost_fn'
    locals_['activation'] = 'activation'
    locals_['env'] = 'env'
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    #========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.
    random_controller = RandomController(env)
    paths = sample(env=env, controller=random_controller,
                   num_paths=num_paths_random, horizon=env_horizon, verbose=False)

    #========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.
    #
    normalization = {
        "observations": compute_normalization(paths["observations"]),
        "actions": compute_normalization(paths["actions"]),
        "deltas": compute_normalization(paths["next_observations"] - paths["observations"])
    }

    #========================================================
    #
    # Build dynamics model and MPC controllers.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env, n_layers=n_layers, size=size,
                                activation=activation, output_activation=output_activation,
                                normalization=normalization, batch_size=batch_size,
                                iterations=dynamics_iters, learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env, dyn_model=dyn_model, horizon=mpc_horizon,
                                   cost_fn=cost_fn, num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    #========================================================
    #
    # Take multiple iterations of onpolicy aggregation, at each iteration refitting
    # the dynamics model to the current dataset and then taking onpolicy samples
    # and aggregating them into the dataset.
    #
    # TODO: implement mixing ratio for new and old data as described in
    # https://arxiv.org/abs/1708.02596
    #
    for itr in range(onpol_iters):
        # Shuffle the aggregated dataset before refitting the dynamics model.
        shuffle_indexes = np.random.permutation(paths["observations"].shape[0])
        for key in ['observations', 'actions', 'next_observations', 'rewards']:
            paths[key] = paths[key][shuffle_indexes]

        dyn_model.fit(paths)

        newpaths = sample(env=env, controller=mpc_controller,
                          num_paths=num_paths_onpol, horizon=env_horizon, verbose=False)

        # LOGGING
        # Statistics for performance of MPC policy using
        # our learned dynamics model
        costs = path_cost(cost_fn, newpaths)
        returns = newpaths["acc_rewards"]

        logz.log_tabular('Iteration', itr)
        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled out trajectory
        # using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))
        logz.dump_tabular()

        # Aggregate the new on-policy data into the dataset.
        for key in ['observations', 'actions', 'next_observations', 'rewards']:
            paths[key] = np.concatenate([paths[key], newpaths[key]])
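# path_cost is used above and in the later scripts but is not defined in this excerpt.
# A minimal sketch matching the single-path usage (summing the per-step planning cost over
# one rollout) is given below; the field names and the assumption that cost_fn is vectorized
# over timesteps are mine, and the variant above that takes a whole batch of paths presumably
# returns one value per path instead.
import numpy as np

def path_cost(cost_fn, path):
    """Hypothetical sketch: total planning cost of one rollout,
    i.e. the per-step cost summed over (s_t, a_t, s_{t+1}) triples."""
    return np.sum(cost_fn(path['observations'],
                          path['actions'],
                          path['next_observations']))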
        }
        for tag, value in info.items():
            logger.scalar_summary(tag, value, i + 1)

    print('Epoch ', (epoch + 1), '/', epoch_size,
          'Train loss %.3f' % loss_train.data[0],
          'Validation loss %.3f' % loss.data[0])

env = gym.make(env_name)

mpc_controller = MPCcontroller(env=env,
                               dyn_model=dyn_model,
                               horizon=mpc_horizon,
                               cost_fn=cost_fn,
                               num_simulated_paths=num_simulated_paths,
                               )

dataset = MotionDataset(Trainset_file)
train_loader = DataLoader(dataset=dataset, batch_size=batch_size,
                          shuffle=True, num_workers=0)

test_dataset = MotionDataset(Testset_file)
test_loader = DataLoader(dataset=test_dataset, batch_size=test_dataset.len,
                         shuffle=True, num_workers=0)
def main():
    nb_total_steps = 1000
    nb_iterations = 40
    hidden_layers = [256, 256]

    writer = tensorboardX.SummaryWriter()

    args = parse_args(__doc__, ['env'])
    env = gym.make(args.env)
    ctrl = rand_ctrl = RandomController(env)
    # ipdb.set_trace()

    print('#inputs : %d' % ctrl.nb_inputs())
    print('#actions: %d' % ctrl.nb_actions())

    # f_net = make_net(
    #     [ctrl.nb_inputs() + ctrl.nb_actions()] + hidden_layers + [ctrl.nb_inputs()],
    #     [nn.ReLU() for _ in hidden_layers],
    # )
    f_net = MOENetwork(
        nb_inputs=ctrl.nb_inputs() + ctrl.nb_actions(),
        nb_experts=4,
        gait_layers=[64],
        expert_layers=[64, ctrl.nb_inputs()],
    )

    data = collect_data(env, ctrl, nb_total_steps * 10)
    # ipdb.set_trace()
    dynamics = DynamicsModel(env, f_net, data.get_all(), writer=writer)

    # cost_func = lambda s, a, sn: -sn[3].item()  # refers to vx
    cost_func = get_cost(args.env)  # refers to vx

    # data.calc_normalizations()
    # dynamics.fit(data)

    mpc_ctrl = MPCcontroller(env, dynamics.predict, cost_func,
                             num_simulated_paths=100, horizon=10, num_mpc_steps=10)

    eval_args = EvaluationArgs(nb_burnin_steps=4, nb_episodes=10,
                               horizons=[1, 2, 4, 8, 16, 32])

    for i in range(nb_iterations):
        print('Iteration', i)
        new_data = collect_data(env, ctrl, nb_total_steps)
        dynamics.fit(*new_data.get_all())
        data.extend(new_data)
        dynamics.fit(*data.sample(sample_size=4 * nb_total_steps))

        evaluate_and_log_dynamics(dynamics.predict, env, rand_ctrl,
                                  writer=writer, i_step=i, args=eval_args)
        evaluate_and_log_dynamics(dynamics.predict, env, mpc_ctrl,
                                  writer=writer, i_step=i, args=eval_args)
        # dynamics.fit(*data.get_all())

        # Alternate between the random and the MPC controller for data collection.
        if random.random() > 0.5:
            ctrl = rand_ctrl
        else:
            ctrl = mpc_ctrl

    env = gym.make(args.env)
    ctrl = MPCcontroller(env, dynamics.predict, cost_func,
                         num_simulated_paths=1000, num_mpc_steps=4)  # TODO

    env.render(mode='human')
    obs = env.reset()
    for _ in range(100):
        # time.sleep(1. / 60.)
        obs, r, done, _ = env.step(ctrl.get_action(obs))
        # print(' ', cost_func(obs))
        if done:
            print("done:", r, obs)
            time.sleep(1)
            ctrl.reset()
            obs = env.reset()

    ipdb.set_trace()
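# RandomController appears in every script above but its definition lives elsewhere in the
# repository. A minimal sketch of the conventional behaviour (uniform random actions drawn
# from the environment's action space) follows; any method beyond get_action is an assumption.
class RandomController:
    """Illustrative baseline controller: ignores the state and samples a random action."""

    def __init__(self, env):
        self.env = env

    def get_action(self, state):
        # Uniformly sample a valid action from the environment's action space.
        return self.env.action_space.sample()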
def train(env, cost_fn, logdir=None, render=False, learning_rate=1e-3,
          onpol_iters=10, dynamics_iters=60, batch_size=512,
          num_paths_random=10, num_paths_onpol=10, num_simulated_paths=10000,
          env_horizon=1000, mpc_horizon=15, n_layers=2, size=500,
          activation=tf.nn.relu, output_activation=None,
          clip_param=0.2, entcoeff=0.0, gamma=0.99, lam=0.95,
          optim_epochs=10, optim_batchsize=64, schedule='linear',
          optim_stepsize=3e-4, timesteps_per_actorbatch=1000,
          BEHAVIORAL_CLONING=True, PPO=True,
          ):
    start = time.time()

    logz.configure_output_dir(logdir)

    print("-------- env info --------")
    print("observation_space: ", env.observation_space.shape)
    print("action_space: ", env.action_space.shape)
    print("BEHAVIORAL_CLONING: ", BEHAVIORAL_CLONING)
    print("PPO: ", PPO)
    print(" ")

    random_controller = RandomController(env)
    model_data_buffer = DataBuffer()
    ppo_data_buffer = DataBuffer_general(BC_BUFFER_SIZE, 6)
    bc_data_buffer = DataBuffer_general(BC_BUFFER_SIZE, 2)

    # sample path
    print("collecting random data ..... ")
    paths = sample(env, random_controller, num_paths=num_paths_random,
                   horizon=env_horizon, render=False, verbose=False)

    # add into buffer
    for path in paths:
        for n in range(len(path['observations'])):
            model_data_buffer.add(path['observations'][n],
                                  path['actions'][n],
                                  path['next_observations'][n])

    print("model data buffer size: ", model_data_buffer.size)

    normalization = compute_normalization(model_data_buffer)

    #========================================================
    #
    # Build dynamics model, MPC controllers and behavioral cloning network.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env, n_layers=n_layers, size=size,
                                activation=activation, output_activation=output_activation,
                                normalization=normalization, batch_size=batch_size,
                                iterations=dynamics_iters, learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env, dyn_model=dyn_model, horizon=mpc_horizon,
                                   cost_fn=cost_fn, num_simulated_paths=num_simulated_paths)

    policy_nn = MlpPolicy_bc(sess=sess, env=env, hid_size=64, num_hid_layers=2,
                             clip_param=clip_param, entcoeff=entcoeff)

    bc_net = BCnetwork(sess, env, BATCH_SIZE_BC, learning_rate)

    mpc_controller_bc_ppo = MPCcontroller_BC_PPO(env=env, dyn_model=dyn_model,
                                                 bc_ppo_network=policy_nn,
                                                 horizon=mpc_horizon, cost_fn=cost_fn,
                                                 num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    # sess.__enter__()
    tf.global_variables_initializer().run()

    # init or load checkpoint with saver
    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR)

    if checkpoint and checkpoint.model_checkpoint_path and LOAD_MODEL:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old checkpoint")
        if not os.path.exists(CHECKPOINT_DIR):
            os.mkdir(CHECKPOINT_DIR)

    #========================================================
    #
    # Prepare for rollouts
    #
    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
    max_timesteps = num_paths_onpol * env_horizon

    for itr in range(onpol_iters):
        print("onpol_iters: ", itr)

        dyn_model.fit(model_data_buffer)

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)

        # saver.save(sess, CHECKPOINT_DIR)
        behavioral_cloning_eval(sess, env, policy_nn, env_horizon)

        ppo_data_buffer.clear()

        seg = traj_segment_generator(policy_nn, mpc_controller, mpc_controller_bc_ppo,
                                     bc_data_buffer, env, env_horizon)
        add_vtarg_and_adv(seg, gamma, lam)

        ob, ac, rew, nxt_ob, atarg, tdlamret = \
            seg["ob"], seg["ac"], seg["rew"], seg["nxt_ob"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate

        for n in range(len(ob)):
            ppo_data_buffer.add((ob[n], ac[n], rew[n], nxt_ob[n], atarg[n], tdlamret[n]))
            bc_data_buffer.add((ob[n], ac[n]))
            model_data_buffer.add(ob[n], ac[n], nxt_ob[n])

        print("ppo_data_buffer size", ppo_data_buffer.size)
        print("bc_data_buffer size", bc_data_buffer.size)
        print("model data buffer size: ", model_data_buffer.size)

        # optim_batchsize = optim_batchsize or ob.shape[0]
        # behavioral_cloning(sess, env, bc_net, mpc_controller, env_horizon, bc_data_buffer, Training_epoch=1000)

        if hasattr(policy_nn, "ob_rms"):
            policy_nn.ob_rms.update(ob)  # update running mean/std for policy
        policy_nn.assign_old_eq_new()  # set old parameter values to new parameter values

        for op_ep in range(optim_epochs):
            # losses = []  # list of tuples, each of which gives the loss for a minibatch
            # for i in range(int(timesteps_per_actorbatch/optim_batchsize)):
            if PPO:
                sample_ob_no, sample_ac_na, sample_rew, sample_nxt_ob_no, sample_adv_n, sample_b_n_target = \
                    ppo_data_buffer.sample(optim_batchsize)
                newlosses = policy_nn.lossandupdate_ppo(
                    sample_ob_no, sample_ac_na, sample_adv_n, sample_b_n_target,
                    cur_lrmult, optim_stepsize * cur_lrmult)
                # losses.append(newlosses)

            if BEHAVIORAL_CLONING:
                sample_ob_no, sample_ac_na = bc_data_buffer.sample(optim_batchsize)
                policy_nn.update_bc(sample_ob_no, sample_ac_na, optim_stepsize * cur_lrmult)

            if op_ep % 100 == 0:
                print('epoch: ', op_ep)
                behavioral_cloning_eval(sess, env, policy_nn, env_horizon)

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        ep_lengths = seg["ep_lens"]
        returns = seg["ep_rets"]

        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", iters_so_far)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        # logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", timesteps_so_far)
        logz.dump_tabular()
        logz.pickle_tf_vars()
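# flatten_lists is used with MPI.COMM_WORLD.allgather in several of these loops but is not
# shown here; the usual one-liner from the baselines-style code this mirrors is sketched below.
def flatten_lists(listoflists):
    """Concatenate a list of lists into a single flat list."""
    return [el for list_ in listoflists for el in list_]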
def train(env, cost_fn, logdir=None, render=False, learning_rate=1e-3,
          onpol_iters=10, dynamics_iters=60, batch_size=512,
          num_paths_random=10, num_paths_onpol=10, num_simulated_paths=10000,
          env_horizon=1000, mpc_horizon=15, n_layers=2, size=500,
          activation=tf.nn.relu, output_activation=None,
          clip_param=0.2, entcoeff=0.0, gamma=0.99, lam=0.95,
          optim_epochs=10, optim_batchsize=64, schedule='linear',
          bc_lr=1e-3, ppo_lr=3e-4, timesteps_per_actorbatch=1000,
          MPC=True, BEHAVIORAL_CLONING=True, PPO=True,
          ):
    start = time.time()

    print("-------- env info --------")
    print("Environment: ", FLAGS.env_name)
    print("observation_space: ", env.observation_space.shape)
    print("action_space: ", env.action_space.shape)
    print("action_space low: ", env.action_space.low)
    print("action_space high: ", env.action_space.high)
    print("BEHAVIORAL_CLONING: ", BEHAVIORAL_CLONING)
    print("PPO: ", PPO)
    print("MPC-AUG: ", MPC)
    print(" ")

    random_controller = RandomController(env)

    # Create buffers
    model_data_buffer = DataBufferGeneral(FLAGS.MODELBUFFER_SIZE, 5)
    ppo_data_buffer = DataBufferGeneral(10000, 4)
    bc_data_buffer = DataBufferGeneral(2000, 2)

    # Random sample path
    print("collecting random data ..... ")
    paths = sample(env, random_controller, num_paths=num_paths_random,
                   horizon=env_horizon, render=False, verbose=False)

    # add into buffer
    for path in paths:
        for n in range(len(path['observations'])):
            model_data_buffer.add([path['observations'][n],
                                   path['actions'][n],
                                   path['rewards'][n],
                                   path['next_observations'][n],
                                   path['next_observations'][n] - path['observations'][n]])

    print("model data buffer size: ", model_data_buffer.size)

    normalization = compute_normalization(model_data_buffer)

    #========================================================
    #
    # Build dynamics model, MPC controllers and behavioral cloning network.
    #
    # tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    sess = tf.Session(config=tf_config)

    policy_nn = MlpPolicy(sess=sess, env=env, hid_size=128, num_hid_layers=2,
                          clip_param=clip_param, entcoeff=entcoeff)

    if FLAGS.LEARN_REWARD:
        print("Learn reward function")
        dyn_model = NNDynamicsRewardModel(env=env, normalization=normalization,
                                          batch_size=batch_size, iterations=dynamics_iters,
                                          learning_rate=learning_rate, sess=sess)
        mpc_ppo_controller = MPCcontrollerPolicyNetReward(
            env=env, dyn_model=dyn_model, explore=FLAGS.MPC_EXP,
            policy_net=policy_nn, self_exp=FLAGS.SELFEXP,
            horizon=mpc_horizon, num_simulated_paths=num_simulated_paths)
    else:
        print("Use predefined cost function")
        dyn_model = NNDynamicsModel(env=env, n_layers=n_layers, size=size,
                                    activation=activation, output_activation=output_activation,
                                    normalization=normalization, batch_size=batch_size,
                                    iterations=dynamics_iters, learning_rate=learning_rate,
                                    sess=sess)
        mpc_ppo_controller = MPCcontrollerPolicyNet(
            env=env, dyn_model=dyn_model, explore=FLAGS.MPC_EXP,
            policy_net=policy_nn, self_exp=FLAGS.SELFEXP,
            horizon=mpc_horizon, cost_fn=cost_fn,
            num_simulated_paths=num_simulated_paths)

    mpc_controller = MPCcontroller(env=env, dyn_model=dyn_model, horizon=mpc_horizon,
                                   cost_fn=cost_fn, num_simulated_paths=num_simulated_paths)

    # if not PPO:
    #     mpc_ppo_controller = mpc_controller

    #========================================================
    #
    # Tensorflow session building.
    #
    # sess.__enter__()
    tf.global_variables_initializer().run()

    # init or load checkpoint with saver
    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(FLAGS.model_path)
    print("checkpoint", checkpoint)

    if checkpoint and checkpoint.model_checkpoint_path and FLAGS.LOAD_MODEL:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old checkpoint")
        if not os.path.exists(FLAGS.model_path):
            os.mkdir(FLAGS.model_path)

    #========================================================
    #
    # Prepare for rollouts
    #
    tstart = time.time()

    states_true = []
    states_predict = []
    rewards_true = []
    rewards_predict = []

    ob = env.reset()
    ob_pre = np.expand_dims(ob, axis=0)

    states_true.append(ob)
    states_predict.append(ob_pre)

    for step in range(100):
        # ac = env.action_space.sample()  # not used, just so we have the datatype
        ac, _ = policy_nn.act(ob, stochastic=True)
        ob, rew, done, _ = env.step(ac)
        ob_pre, r_pre = dyn_model.predict(ob_pre, ac)

        states_true.append(ob)
        rewards_true.append(rew)
        states_predict.append(ob_pre)
        rewards_predict.append(r_pre[0][0])

    states_true = np.asarray(states_true)
    states_predict = np.asarray(states_predict)
    states_predict = np.squeeze(states_predict, axis=1)
    rewards_true = np.asarray(rewards_true)
    rewards_predict = np.asarray(rewards_predict)

    print("states_true", states_true.shape)
    print("states_predict", states_predict.shape)
    print("rewards_true", rewards_true.shape)
    print("rewards_predict", rewards_predict.shape)

    np.savetxt('./data/eval_model/states_true.out', states_true, delimiter=',')
    np.savetxt('./data/eval_model/states_predict.out', states_predict, delimiter=',')
    np.savetxt('./data/eval_model/rewards_true.out', rewards_true, delimiter=',')
    np.savetxt('./data/eval_model/rewards_predict.out', rewards_predict, delimiter=',')
def train(env, cost_fn, logdir=None, render=False, learning_rate=1e-3,
          onpol_iters=10, dynamics_iters=60, batch_size=512,
          num_paths_random=10, num_paths_onpol=10, num_simulated_paths=10000,
          env_horizon=1000, mpc_horizon=15, n_layers=2, size=500,
          activation=tf.nn.relu, output_activation=None):
    """
    Arguments:

    onpol_iters                 Number of iterations of onpolicy aggregation for the loop to run.
    dynamics_iters              Number of iterations of training for the dynamics model
                                which happen per iteration of the aggregation loop.
    batch_size                  Batch size for dynamics training.
    num_paths_random            Number of paths/trajectories/rollouts generated by a random
                                agent. We use these to train our initial dynamics model.
    num_paths_onpol             Number of paths to collect at each iteration of aggregation,
                                using the Model Predictive Control policy.
    num_simulated_paths         How many fictitious rollouts the MPC policy should generate
                                each time it is asked for an action.
    env_horizon                 Number of timesteps in each path.
    mpc_horizon                 The MPC policy generates actions by imagining fictitious
                                rollouts, and picking the first action of the best fictitious
                                rollout. This argument is how many timesteps should be in
                                each fictitious rollout.
    n_layers/size/activations   Neural network architecture arguments.
    """
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    print(params)
    # the three lines below override the functions passed in, which aren't serializable
    params["activation"] = "relu"
    params["cost_fn"] = "cheetah_cost_fn"
    params["env"] = "HalfCheetahEnvNew"
    logz.save_params(params)

    returns_file = "returns.csv"
    returns_array = []

    #========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.
    random_controller = RandomController(env)
    """ YOUR CODE HERE """
    data = sample(env, random_controller, num_paths=num_paths_random,
                  horizon=env_horizon, render=False, verbose=False)

    #========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.
    #
    """ YOUR CODE HERE """
    normalization = compute_normalization(data)

    #========================================================
    #
    # Build dynamics model and MPC controllers.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env, n_layers=n_layers, size=size,
                                activation=activation, output_activation=output_activation,
                                normalization=normalization, batch_size=batch_size,
                                iterations=dynamics_iters, learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env, dyn_model=dyn_model, horizon=mpc_horizon,
                                   cost_fn=cost_fn, num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    #========================================================
    #
    # Take multiple iterations of onpolicy aggregation, at each iteration refitting
    # the dynamics model to the current dataset and then taking onpolicy samples
    # and aggregating them into the dataset.
    #
    # Note: You don't need to use a mixing ratio in this assignment for new and old data
    # as described in https://arxiv.org/abs/1708.02596
    #
    for itr in range(onpol_iters):
        """ YOUR CODE HERE """
        print(itr)

        # learn/fit dynamics model using the Adam optimization algorithm
        l = dyn_model.fit(data)
        print(l)

        # sample a set of on-policy trajectories from the environment
        new_data = sample(env, mpc_controller, num_paths=num_paths_onpol,
                          horizon=env_horizon, render=render, verbose=False)

        # append transitions to the dataset
        data += new_data

        # compute costs
        costs = np.array([path_cost(cost_fn, path) for path in new_data])
        print(costs)

        # compute returns
        returns = np.array([new_data[i]["returns"] for i in range(len(new_data))])
        print(returns)
        returns_array.append(returns)
        np.array(returns_array).dump(returns_file)

        # LOGGING
        # Statistics for performance of MPC policy using
        # our learned dynamics model
        logz.log_tabular('Iteration', itr)
        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled out trajectory using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))
        logz.dump_tabular()
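# For reference, a hedged example of how one of these train functions is typically invoked.
# The environment name, log directory, and the availability of cheetah_cost_fn at this call
# site are assumptions based on names that appear elsewhere in these scripts; the argument
# values simply echo the defaults above.
import gym
import tensorflow as tf

if __name__ == '__main__':
    env = gym.make('HalfCheetah-v2')   # environment name assumed
    train(env=env,
          cost_fn=cheetah_cost_fn,     # cost function referenced elsewhere in the repo
          logdir='data/mpc_halfcheetah',
          onpol_iters=10,
          dynamics_iters=60,
          batch_size=512,
          num_paths_random=10,
          num_paths_onpol=10,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=15,
          n_layers=2,
          size=500,
          activation=tf.nn.relu)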
def train(env, cost_fn, logdir=None, render=False, learning_rate=1e-3,
          onpol_iters=10, dynamics_iters=60, batch_size=512,
          num_paths_random=10, num_paths_onpol=10, num_simulated_paths=10000,
          env_horizon=1000, mpc_horizon=15, n_layers=2, size=500,
          activation=tf.nn.relu, output_activation=None
          ):
    # tracker = SummaryTracker()
    """
    Arguments:

    onpol_iters                 Number of iterations of onpolicy aggregation for the loop to run.
    dynamics_iters              Number of iterations of training for the dynamics model
                                which happen per iteration of the aggregation loop.
    batch_size                  Batch size for dynamics training.
    num_paths_random            Number of paths/trajectories/rollouts generated by a random
                                agent. We use these to train our initial dynamics model.
    num_paths_onpol             Number of paths to collect at each iteration of aggregation,
                                using the Model Predictive Control policy.
    num_simulated_paths         How many fictitious rollouts the MPC policy should generate
                                each time it is asked for an action.
    env_horizon                 Number of timesteps in each path.
    mpc_horizon                 The MPC policy generates actions by imagining fictitious
                                rollouts, and picking the first action of the best fictitious
                                rollout. This argument is how many timesteps should be in
                                each fictitious rollout.
    n_layers/size/activations   Neural network architecture arguments.
    """
    logz.configure_output_dir(logdir)

    #========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.
    """ YOUR CODE HERE """
    # Print env info
    print("-------- env info --------")
    print("observation_space: ", env.observation_space.shape)
    print("action_space: ", env.action_space.shape)
    print(" ")

    random_controller = RandomController(env)
    data_buffer = DataBuffer()
    bc_data_buffer = DataBuffer_SA(BC_BUFFER_SIZE)

    # sample path
    print("collecting random data ..... ")
    paths = sample(env, random_controller, num_paths=num_paths_random,
                   horizon=env_horizon, render=False, verbose=False)

    # add into buffer
    for path in paths:
        for n in range(len(path['observations'])):
            data_buffer.add(path['observations'][n],
                            path['actions'][n],
                            path['next_observations'][n])

    #========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.
    #
    print("data buffer size: ", data_buffer.size)

    normalization = compute_normalization(data_buffer)

    #========================================================
    #
    # Build dynamics model, MPC controllers and behavioral cloning network.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env, n_layers=n_layers, size=size,
                                activation=activation, output_activation=output_activation,
                                normalization=normalization, batch_size=batch_size,
                                iterations=dynamics_iters, learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env, dyn_model=dyn_model, horizon=mpc_horizon,
                                   cost_fn=cost_fn, num_simulated_paths=num_simulated_paths)

    bc_net = BCnetwork(sess, env, BATCH_SIZE_BC, learning_rate)

    mpc_controller_bc = MPCcontroller_BC(env=env, dyn_model=dyn_model, bc_network=bc_net,
                                         horizon=mpc_horizon, cost_fn=cost_fn,
                                         num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    # sess.__enter__()
    tf.global_variables_initializer().run()

    # init or load checkpoint with saver
    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR)

    if checkpoint and checkpoint.model_checkpoint_path and LOAD_MODEL:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old checkpoint")
        if not os.path.exists(CHECKPOINT_DIR):
            os.mkdir(CHECKPOINT_DIR)

    #========================================================
    #
    # Take multiple iterations of onpolicy aggregation, at each iteration refitting
    # the dynamics model to the current dataset and then taking onpolicy samples
    # and aggregating them into the dataset.
    #
    # Note: You don't need to use a mixing ratio in this assignment for new and old data
    # as described in https://arxiv.org/abs/1708.02596
    #
    for itr in range(onpol_iters):
        """ YOUR CODE HERE """
        print("onpol_iters: ", itr)

        dyn_model.fit(data_buffer)
        saver.save(sess, CHECKPOINT_DIR)

        returns = []
        costs = []

        for w in range(num_paths_onpol):
            print("paths_onpol: ", w, " running.....")
            print("data buffer size: ", data_buffer.size)

            st = env.reset_model()
            path = {'observations': [], 'actions': [], 'next_observations': []}
            # tracker.print_diff()
            return_ = 0

            for i in range(env_horizon):
                if render:
                    env.render()
                # print("env_horizon: ", i)

                if BEHAVIORAL_CLONING:
                    if bc_data_buffer.size > 2000:
                        at = mpc_controller_bc.get_action(st)
                    else:
                        at = mpc_controller.get_action(st)
                else:
                    at = mpc_controller.get_action(st)
                    # at = random_controller.get_action(st)

                st_next, env_reward, _, _ = env._step(at)
                path['observations'].append(st)
                path['actions'].append(at)
                path['next_observations'].append(st_next)
                st = st_next
                return_ += env_reward

            # cost & return
            cost = path_cost(cost_fn, path)
            costs.append(cost)
            returns.append(return_)
            print("total return: ", return_)
            print("costs: ", cost)

            # add into buffers
            for n in range(len(path['observations'])):
                data_buffer.add(path['observations'][n],
                                path['actions'][n],
                                path['next_observations'][n])
                bc_data_buffer.add(path['observations'][n], path['actions'][n])

        if BEHAVIORAL_CLONING:
            behavioral_cloning(sess, env, bc_net, mpc_controller, env_horizon,
                               bc_data_buffer, Training_epoch=1000)

        # LOGGING
        # Statistics for performance of MPC policy using
        # our learned dynamics model
        logz.log_tabular('Iteration', itr)
        # logz.log_tabular('Average_BC_Return', np.mean(bc_returns))
        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled out trajectory using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))
        logz.dump_tabular()
def train(env, cost_fn, logdir=None, render=False, learning_rate=1e-3, onpol_iters=10, dynamics_iters=60, batch_size=512, num_paths_random=10, num_paths_onpol=10, num_simulated_paths=10000, env_horizon=1000, mpc_horizon=15, n_layers=2, size=500, activation=tf.nn.relu, output_activation=None): """ Arguments: onpol_iters Number of iterations of onpolicy aggregation for the loop to run. dynamics_iters Number of iterations of training for the dynamics model |_ which happen per iteration of the aggregation loop. batch_size Batch size for dynamics training. num_paths_random Number of paths/trajectories/rollouts generated | by a random agent. We use these to train our |_ initial dynamics model. num_paths_onpol Number of paths to collect at each iteration of |_ aggregation, using the Model Predictive Control policy. num_simulated_paths How many fictitious rollouts the MPC policy | should generate each time it is asked for an |_ action. env_horizon Number of timesteps in each path. mpc_horizon The MPC policy generates actions by imagining | fictitious rollouts, and picking the first action | of the best fictitious rollout. This argument is | how many timesteps should be in each fictitious |_ rollout. n_layers/size/activations Neural network architecture arguments. """ logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} del params['cost_fn'] del params['activation'] del params['output_activation'] del params['env'] logz.save_params(params) #======================================================== # # First, we need a lot of data generated by a random # agent, with which we'll begin to train our dynamics # model. random_controller = RandomController(env) """ YOUR CODE HERE """ # Sample from random controller paths = sample(env, random_controller, num_paths_random, env_horizon, render, True) # Build data set data = dict() data['observations'] = np.concatenate( [path['observations'] for path in paths]) data['actions'] = np.concatenate([path['actions'] for path in paths]) next_observations = np.concatenate( [path['next_observations'] for path in paths]) data['deltas'] = next_observations - data['observations'] #======================================================== # # The random data will be used to get statistics (mean # and std) for the observations, actions, and deltas # (where deltas are o_{t+1} - o_t). These will be used # for normalizing inputs and denormalizing outputs # from the dynamics network. # """ YOUR CODE HERE """ normalization = compute_normalization(data) #======================================================== # # Build dynamics model and MPC controllers. # sess = tf.Session() dyn_model = NNDynamicsModel(env=env, n_layers=n_layers, size=size, activation=activation, output_activation=output_activation, normalization=normalization, batch_size=batch_size, iterations=dynamics_iters, learning_rate=learning_rate, sess=sess) mpc_controller = MPCcontroller(env=env, dyn_model=dyn_model, horizon=mpc_horizon, cost_fn=cost_fn, num_simulated_paths=num_simulated_paths) #======================================================== # # Tensorflow session building. # sess.__enter__() tf.global_variables_initializer().run() #======================================================== # # Take multiple iterations of onpolicy aggregation at each iteration refitting the dynamics model to current dataset and then taking onpolicy samples and aggregating to the dataset. 
# Note: You don't need to use a mixing ratio in this assignment for new and old data as described in https://arxiv.org/abs/1708.02596 # for itr in range(onpol_iters): """ YOUR CODE HERE """ # Refit dynamic model dyn_model.fit(data) # Sample on-policy trajectories paths = sample(env, mpc_controller, num_paths_onpol, env_horizon, render, True) # Summarize trajectories costs = [path_cost(cost_fn, path) for path in paths] returns = [np.sum(path['rewards']) for path in paths] # Aggregate data onpol_observations = np.concatenate( [path['observations'] for path in paths]) onpol_actions = np.concatenate([path['actions'] for path in paths]) onpol_next_observations = np.concatenate( [path['next_observations'] for path in paths]) onpol_deltas = onpol_next_observations - onpol_observations data['observations'] = np.append(data['observations'], onpol_observations, 0) data['actions'] = np.append(data['actions'], onpol_actions, 0) data['deltas'] = np.append(data['deltas'], onpol_deltas, 0) # LOGGING # Statistics for performance of MPC policy using # our learned dynamics model logz.log_tabular('Iteration', itr) # In terms of cost function which your MPC controller uses to plan logz.log_tabular('AverageCost', np.mean(costs)) logz.log_tabular('StdCost', np.std(costs)) logz.log_tabular('MinimumCost', np.min(costs)) logz.log_tabular('MaximumCost', np.max(costs)) # In terms of true environment reward of your rolled out trajectory using the MPC controller logz.log_tabular('AverageReturn', np.mean(returns)) logz.log_tabular('StdReturn', np.std(returns)) logz.log_tabular('MinimumReturn', np.min(returns)) logz.log_tabular('MaximumReturn', np.max(returns)) logz.dump_tabular()
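# Hedged sketch of how a delta-predicting dynamics network typically uses the normalization
# statistics described in the function above: inputs are standardized, the network predicts a
# normalized state delta, and the prediction is de-standardized before being added to the
# current observation. The 6-tuple layout of `normalization` and the `delta_net` callable are
# illustrative assumptions, not this repo's NNDynamicsModel API.
import numpy as np

def predict_next_obs(obs, acs, normalization, delta_net, eps=1e-8):
    mean_obs, std_obs, mean_deltas, std_deltas, mean_acs, std_acs = normalization
    obs_norm = (obs - mean_obs) / (std_obs + eps)
    acs_norm = (acs - mean_acs) / (std_acs + eps)
    # The network consumes the concatenated (observation, action) features.
    delta_norm = delta_net(np.concatenate([obs_norm, acs_norm], axis=-1))
    delta = delta_norm * std_deltas + mean_deltas  # denormalize the predicted delta
    return obs + delta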
def train(env, cost_fn, logdir=None, render=False, learning_rate=1e-3, onpol_iters=1, dynamics_iters=60, batch_size=512, num_paths_random=10, num_paths_onpol=1, num_simulated_paths=10000, env_horizon=1000, mpc_horizon=15, n_layers=2, size=100, activation=tf.nn.relu, output_activation=None): """ Arguments: onpol_iters Number of iterations of onpolicy aggregation for the loop to run. dynamics_iters Number of iterations of training for the dynamics model |_ which happen per iteration of the aggregation loop. batch_size Batch size for dynamics training. num_paths_random Number of paths/trajectories/rollouts generated | by a random agent. We use these to train our |_ initial dynamics model. num_paths_onpol Number of paths to collect at each iteration of |_ aggregation, using the Model Predictive Control policy. num_simulated_paths How many fictitious rollouts the MPC policy | should generate each time it is asked for an |_ action. env_horizon Number of timesteps in each path. mpc_horizon The MPC policy generates actions by imagining | fictitious rollouts, and picking the first action | of the best fictitious rollout. This argument is | how many timesteps should be in each fictitious |_ rollout. n_layers/size/activations Neural network architecture arguments. """ logz.configure_output_dir(logdir) #======================================================== # # First, we need a lot of data generated by a random # agent, with which we'll begin to train our dynamics # model. random_controller = RandomController(env) paths, rewards, costs = sample(env, random_controller, num_paths_random) obs = np.concatenate([path["observations"] for path in paths]) acs = np.concatenate([path["actions"] for path in paths]) n_obs = np.concatenate([path["next_observations"] for path in paths]) delta = n_obs - obs data = {'observations': obs, 'actions': acs, 'delta': delta} #======================================================== # # The random data will be used to get statistics (mean # and std) for the observations, actions, and deltas # (where deltas are o_{t+1} - o_t). These will be used # for normalizing inputs and denormalizing outputs # from the dynamics network. # mean_obs, std_obs, mean_deltas, std_deltas, mean_actions, std_actions = compute_normalization( data) normalization = dict() normalization['observations'] = [mean_obs, std_obs] normalization['actions'] = [mean_actions, std_actions] normalization['delta'] = [mean_deltas, std_deltas] #======================================================== # # Build dynamics model and MPC controllers. # sess = tf.Session() dyn_model = NNDynamicsModel(env=env, n_layers=n_layers, size=size, activation=activation, output_activation=output_activation, normalization=normalization, batch_size=batch_size, iterations=dynamics_iters, learning_rate=learning_rate, sess=sess) mpc_controller = MPCcontroller(env=env, dyn_model=dyn_model, horizon=mpc_horizon, cost_fn=cost_fn, num_simulated_paths=num_simulated_paths) #======================================================== # # Tensorflow session building. # sess.__enter__() tf.global_variables_initializer().run() #======================================================== # # Take multiple iterations of onpolicy aggregation at each iteration refitting the dynamics model to current dataset and then taking onpolicy samples and aggregating to the dataset. 
# Note: You don't need to use a mixing ratio in this assignment for new and old data as described in https://arxiv.org/abs/1708.02596 # print("onpol_iter", onpol_iters) for itr in range(onpol_iters): """ YOUR CODE HERE """ print(data['observations'].shape) #print(data['observations'].shape) dyn_model.fit(data) # Generate trajectories from MPC controllers pathsM, returns, costs = sample(env, mpc_controller, num_paths_onpol) obs = np.concatenate([path["observations"] for path in pathsM]) acs = np.concatenate([path["actions"] for path in pathsM]) n_obs = np.concatenate([path["next_observations"] for path in pathsM]) delta = n_obs - obs data = { 'observations': np.concatenate((data['observations'], obs)), 'actions': np.concatenate((data['actions'], acs)), 'delta': np.concatenate((data['delta'], delta)) } # LOGGING # Statistics for performance of MPC policy using # our learned dynamics model logz.log_tabular('Iteration', itr) # In terms of cost function which your MPC controller uses to plan logz.log_tabular('AverageCost', np.mean(costs)) logz.log_tabular('StdCost', np.std(costs)) logz.log_tabular('MinimumCost', np.min(costs)) logz.log_tabular('MaximumCost', np.max(costs)) # In terms of true environment reward of your rolled out trajectory using the MPC controller logz.log_tabular('AverageReturn', np.mean(returns)) logz.log_tabular('StdReturn', np.std(returns)) logz.log_tabular('MinimumReturn', np.min(returns)) logz.log_tabular('MaximumReturn', np.max(returns)) logz.dump_tabular()
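# Hedged sketch of the compute_normalization helper unpacked in the variant above, assuming a
# data dict with 'observations', 'actions' and 'delta' arrays stacked over timesteps. The
# return order matches the unpacking used above; the exact implementation is an assumption.
import numpy as np

def compute_normalization(data):
    mean_obs = np.mean(data['observations'], axis=0)
    std_obs = np.std(data['observations'], axis=0)
    mean_deltas = np.mean(data['delta'], axis=0)
    std_deltas = np.std(data['delta'], axis=0)
    mean_actions = np.mean(data['actions'], axis=0)
    std_actions = np.std(data['actions'], axis=0)
    return mean_obs, std_obs, mean_deltas, std_deltas, mean_actions, std_actions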
def train( env, cost_fn, logdir=None, render=False, learning_rate=1e-3, onpol_iters=10, dynamics_iters=60, batch_size=512, num_paths_random=10, num_paths_onpol=10, num_simulated_paths=1000, env_horizon=1000, mpc_horizon=15, n_layers=2, size=500, activation=tf.nn.relu, output_activation=None, ): """ Arguments: onpol_iters Number of iterations of onpolicy aggregation for the loop to run. dynamics_iters Number of iterations of training for the dynamics model |_ which happen per iteration of the aggregation loop. batch_size Batch size for dynamics training. num_paths_random Number of paths/trajectories/rollouts generated | by a random agent. We use these to train our |_ initial dynamics model. num_paths_onpol Number of paths to collect at each iteration of |_ aggregation, using the Model Predictive Control policy. num_simulated_paths How many fictitious rollouts the MPC policy | should generate each time it is asked for an |_ action. env_horizon Number of timesteps in each path. mpc_horizon The MPC policy generates actions by imagining | fictitious rollouts, and picking the first action | of the best fictitious rollout. This argument is | how many timesteps should be in each fictitious |_ rollout. n_layers/size/activations Neural network architecture arguments. """ logz.configure_output_dir(logdir) #======================================================== # # First, we need a lot of data generated by a random # agent, with which we'll begin to train our dynamics # model. random_controller = RandomController(env) """ YOUR CODE HERE """ paths_rand = sample(env, random_controller, num_paths=num_paths_random, horizon=env_horizon, render=render, verbose=False) #======================================================== # # The random data will be used to get statistics (mean # and std) for the observations, actions, and deltas # (where deltas are o_{t+1} - o_t). These will be used # for normalizing inputs and denormalizing outputs # from the dynamics network. # """ YOUR CODE HERE """ normalization = compute_normalization(paths_rand) gamma = 0.99 #======================================================== # # Build dynamics model and MPC controllers. # sess = tf.Session() dyn_model = NNDynamicsModel(env=env, n_layers=n_layers, size=size, activation=activation, output_activation=output_activation, normalization=normalization, batch_size=batch_size, iterations=dynamics_iters, learning_rate=learning_rate, sess=sess) mpc_controller = MPCcontroller(env=env, dyn_model=dyn_model, horizon=mpc_horizon, cost_fn=cost_fn, num_simulated_paths=num_simulated_paths) #======================================================== # # Tensorflow session building. # sess.__enter__() tf.global_variables_initializer().run() #======================================================== # # Take multiple iterations of onpolicy aggregation at each iteration refitting the dynamics model to current dataset and then taking onpolicy samples and aggregating to the dataset. 
# Note: You don't need to use a mixing ratio in this assignment for new and old data as described in https://arxiv.org/abs/1708.02596 # # pre-fit the dynamics model before the on-policy DAgger loop: print("****** Pretrain dynamics model *******") losses = [] obs_rand = np.concatenate([path["observation"] for path in paths_rand]) action_rand = np.concatenate([path["action"] for path in paths_rand]) next_ob_rand = np.concatenate([path["obs_next"] for path in paths_rand]) data_size_rand = obs_rand.shape[0] for i in range(1000): # obtain batch size from random policy batch_idx_rand = np.random.randint(data_size_rand, size=batch_size) batch_ob_rand = obs_rand[batch_idx_rand, :] batch_ac_rand = action_rand[batch_idx_rand, :] batch_nxt_rand = next_ob_rand[batch_idx_rand, :] # obtain batch size from on policy batch_ob = np.copy(batch_ob_rand) batch_ac = np.copy(batch_ac_rand) batch_nxt = np.copy(batch_nxt_rand) loss = dyn_model.fit(batch_ob, batch_ac, batch_nxt) losses.append(loss) if (i % 20 == 0): print('loss', loss) costs = [] returns = [] paths_rl = [] for itr in range(onpol_iters): """ YOUR CODE HERE """ # fit the dynamics model if itr > 0: obs_rl = np.concatenate([path["observation"] for path in paths_rl]) action_rl = np.concatenate([path["action"] for path in paths_rl]) next_ob_rl = np.concatenate( [path["obs_next"] for path in paths_rl]) obs_rand = np.concatenate([path["observation"] for path in paths_rand]) action_rand = np.concatenate([path["action"] for path in paths_rand]) next_ob_rand = np.concatenate( [path["obs_next"] for path in paths_rand]) # print obs[128,:].shape data_size_rand = obs_rand.shape[0] if itr > 0: data_size_rl = obs_rl.shape[0] # batch_size=128 losses = [] # fit model function for i in range(dynamics_iters): # obtain batch size from random policy batch_idx_rand = np.random.randint(data_size_rand, size=batch_size // 20) batch_ob_rand = obs_rand[batch_idx_rand, :] batch_ac_rand = action_rand[batch_idx_rand, :] batch_nxt_rand = next_ob_rand[batch_idx_rand, :] # obtain batch size from on policy if itr > 0: batch_idx_rl = np.random.randint(data_size_rl, size=batch_size * 19 // 20) batch_ob_rl = obs_rl[batch_idx_rl, :] batch_ac_rl = action_rl[batch_idx_rl, :] batch_nxt_rl = next_ob_rl[batch_idx_rl, :] # mix them batch_ob = np.concatenate((batch_ob_rand, batch_ob_rl)) batch_ac = np.concatenate((batch_ac_rand, batch_ac_rl)) batch_nxt = np.concatenate((batch_nxt_rand, batch_nxt_rl)) else: batch_ob = np.copy(batch_ob_rand) batch_ac = np.copy(batch_ac_rand) batch_nxt = np.copy(batch_nxt_rand) loss = dyn_model.fit(batch_ob, batch_ac, batch_nxt) losses.append(loss) # if(i%20==0): # print('loss', loss) print("on policy dagger ", itr) ob = env.reset() observes, acs, rewards, obs_2, returns = [], [], [], [], [] steps = 0 g = 0 max_path_length = mpc_controller.horizon timesteps_this_batch = 0 while True: while True: observes.append(ob) ac = mpc_controller.get_action(ob) # print ac acs.append(ac) # print ac ob, rew, done, _ = env.step(ac) g += rew * gamma**steps obs_2.append(ob) rewards.append(rew) returns.append(g) steps += 1 if done or steps > max_path_length: terminated = done break path = { "observation": np.array(observes), "reward": np.array(rewards), "action": np.array(acs), "obs_next": np.array(obs_2), "return": np.array(returns) } paths_rl.append(path) timesteps_this_batch += pathlength(path) print(g) if timesteps_this_batch > batch_size: break trajectory_cost = trajectory_cost_fn(cheetah_cost_fn, path["observation"], path["action"], path["obs_next"]) costs.append(trajectory_cost)
returns.append(path["return"][-1]) # print batch_ob.shape # LOGGING # Statistics for performance of MPC policy using # our learned dynamics model logz.log_tabular('Iteration', itr) # In terms of cost function which your MPC controller uses to plan logz.log_tabular('AverageCost', np.mean(costs)) logz.log_tabular('StdCost', np.std(costs)) logz.log_tabular('MinimumCost', np.min(costs)) logz.log_tabular('MaximumCost', np.max(costs)) # In terms of true environment reward of your rolled out trajectory using the MPC controller logz.log_tabular('AverageReturn', np.mean(returns)) logz.log_tabular('StdReturn', np.std(returns)) logz.log_tabular('MinimumReturn', np.min(returns)) logz.log_tabular('MaximumReturn', np.max(returns)) logz.dump_tabular()
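# Hedged sketch of the roughly 5% random / 95% on-policy minibatch mixing used in the training
# loop above, written with explicit integer sizes so it behaves identically under Python 2 and 3.
# The array triples mirror the (obs, action, next_obs) batches above; the helper itself is
# illustrative and not part of the original code.
import numpy as np

def sample_mixed_batch(rand_data, onpol_data, batch_size, rand_fraction=0.05):
    n_rand = max(1, int(batch_size * rand_fraction))
    n_onpol = batch_size - n_rand
    obs_r, acs_r, nxt_r = rand_data
    obs_o, acs_o, nxt_o = onpol_data
    idx_r = np.random.randint(obs_r.shape[0], size=n_rand)
    idx_o = np.random.randint(obs_o.shape[0], size=n_onpol)
    batch_obs = np.concatenate((obs_r[idx_r], obs_o[idx_o]))
    batch_acs = np.concatenate((acs_r[idx_r], acs_o[idx_o]))
    batch_nxt = np.concatenate((nxt_r[idx_r], nxt_o[idx_o]))
    return batch_obs, batch_acs, batch_nxt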
def train(env, cost_fn, load_model, model_path, logdir=None, render=False, learning_rate_dyn=1e-3, learning_rate_policy=1e-4, onpol_iters=10, dynamics_iters=60, policy_iters=100, batch_size=512, num_paths_random=10, num_paths_onpol=5, num_simulated_paths=10000, env_horizon=1000, mpc_horizon=15, n_layers=2, size=500, activation=tf.nn.relu, output_activation=None, ): """ Arguments: onpol_iters Number of iterations of onpolicy aggregation for the loop to run. dynamics_iters Number of iterations of training for the dynamics model |_ which happen per iteration of the aggregation loop. batch_size Batch size for dynamics training. num_paths_random Number of paths/trajectories/rollouts generated | by a random agent. We use these to train our |_ initial dynamics model. num_paths_onpol Number of paths to collect at each iteration of |_ aggregation, using the Model Predictive Control policy. num_simulated_paths How many fictitious rollouts the MPC policy | should generate each time it is asked for an |_ action. env_horizon Number of timesteps in each path. mpc_horizon The MPC policy generates actions by imagining | fictitious rollouts, and picking the first action | of the best fictitious rollout. This argument is | how many timesteps should be in each fictitious |_ rollout. n_layers/size/activations Neural network architecture arguments. """ #logz.configure_output_dir(logdir) #======================================================== # # First, we need a lot of data generated by a random # agent, with which we'll begin to train our dynamics # model. random_controller = RandomController(env) """ YOUR CODE HERE """ data = sample(env, random_controller, num_paths_random, env_horizon) #======================================================== # # The random data will be used to get statistics (mean # and std) for the observations, actions, and deltas # (where deltas are o_{t+1} - o_t). These will be used # for normalizing inputs and denormalizing outputs # from the dynamics network. # """ YOUR CODE HERE """ normalization = compute_normalization(data) #======================================================== # # Build dynamics model and MPC controllers. # sess = tf.Session() dyn_model = NNDynamicsModel(env=env, n_layers=n_layers, size=size, activation=activation, output_activation=output_activation, normalization=normalization, batch_size=batch_size, iterations=dynamics_iters, learning_rate=learning_rate_dyn, sess=sess) policy = NNPolicy(env=env, normalization=normalization, batch_size=batch_size, iterations=policy_iters, learning_rate=learning_rate_policy, sess=sess, model_path=model_path, save_path="./policy/", load_model=load_model) mpc_controller = MPCcontroller(env=env, dyn_model=dyn_model, horizon=mpc_horizon, cost_fn=cost_fn, num_simulated_paths=num_simulated_paths) lqr_controller = LQRcontroller(env=env, delta=0.005, T=50, dyn_model=dyn_model, cost_fn=cost_fn, iterations=1) comm = MPI.COMM_WORLD size = comm.Get_size() rank = comm.Get_rank() #======================================================== # # Tensorflow session building. # sess.__enter__() tf.global_variables_initializer().run() #======================================================== # # Take multiple iterations of onpolicy aggregation at each iteration refitting the dynamics model to current dataset and then taking onpolicy samples and aggregating to the dataset. 
# Note: You don't need to use a mixing ratio in this assignment for new and old data as described in https://arxiv.org/abs/1708.02596 # # training the MPC controller as well as dynamics for itr in range(onpol_iters): print("fitting dynamics for worker ", rank) dyn_model.fit(data) print("sampling new trajectories from worker ", rank) new_data = sample(env, lqr_controller, num_paths_onpol, env_horizon) data += new_data comm.send(new_data, 0) if rank == 0: costs, returns = [], [] for path in data: costs.append(path_cost(cost_fn, path)) returns.append(np.sum(path['rewards'])) print("returns ",returns) for i in range(1, size): data += comm.recv(source=i) print("fitting policy...") policy.fit(data) # LOGGING # Statistics for performance of MPC policy using # our learned dynamics model logz.log_tabular('Iteration', itr) # In terms of cost function which your MPC controller uses to plan logz.log_tabular('AverageCost', np.mean(costs)) logz.log_tabular('StdCost', np.std(costs)) logz.log_tabular('MinimumCost', np.min(costs)) logz.log_tabular('MaximumCost', np.max(costs)) # In terms of true environment reward of your rolled out trajectory using the MPC controller logz.log_tabular('AverageReturn', np.mean(returns)) logz.log_tabular('StdReturn', np.std(returns)) logz.log_tabular('MinimumReturn', np.min(returns)) logz.log_tabular('MaximumReturn', np.max(returns)) logz.dump_tabular() # applying the learned neural policy if rank == 0: ob = env.reset() while True: a = policy.get_action(ob.reshape((1, ob.shape[0]))) # control clipping to be added next_ob, reward, done, info = env.step(a[0]) print("action", a) print("predicted ob", dyn_model.predict(ob, a)) print("actual ob", (next_ob - normalization[0]) / (normalization[1] + 1e-10)) env.render() ob = next_ob if done: ob = env.reset()
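# Hedged sketch of the same worker-to-rank-0 exchange as the send/recv pattern above, expressed
# with mpi4py's collective gather, which collects one object per rank in a single call.
# `sample_paths` is a stand-in for whatever per-worker sampling produced `new_data` above; it is
# illustrative only, not this repo's API.
from mpi4py import MPI

def gather_new_paths(comm, sample_paths):
    new_data = sample_paths()                  # each worker samples its own trajectories
    gathered = comm.gather(new_data, root=0)   # rank 0 receives a list of per-rank path lists
    if comm.Get_rank() == 0:
        merged = []
        for worker_paths in gathered:
            merged += worker_paths
        return merged
    return new_data

# Usage on every rank: all_paths = gather_new_paths(MPI.COMM_WORLD, sample_paths)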
def train(env, cost_fn, logdir=None, render=False, learning_rate=1e-3, onpol_iters=10, dynamics_iters=60, batch_size=512, num_paths_random=10, num_paths_onpol=10, num_simulated_paths=10000, env_horizon=1000, mpc_horizon=15, n_layers=2, size=500, activation=tf.nn.relu, output_activation=None): """ Arguments: onpol_iters Number of iterations of onpolicy aggregation |_ for the loop to run. dynamics_iters Number of iterations of training for the | dynamics model which happen per iteration of |_ the aggregation loop. batch_size Batch size for dynamics training. num_paths_random Number of paths/trajectories/rollouts generated | by a random agent. We use these to train our |_ initial dynamics model. num_paths_onpol Number of paths to collect at each iteration of | aggregation, using the Model Predictive Control |_ policy. num_simulated_paths How many fictitious rollouts the MPC policy | should generate each time it is asked for an |_ action. env_horizon Number of timesteps in each path. mpc_horizon The MPC policy generates actions by imagining | fictitious rollouts, and picking the first | action of the best fictitious rollout. This | argument is how many timesteps should be in |_ each fictitious rollout. n_layers/size/activations Neural network architecture arguments. """ d("env = {}".format(env)) d("env.observation_space = {}".format(env.observation_space)) d("env.action_space = {}".format(env.action_space)) d("env.observation_space.shape = {}".format(env.observation_space.shape)) d("env.action_space.shape = {}".format(env.action_space.shape)) d("logdir = {}".format(logdir)) d("render = {}".format(render)) d("learning_rate = {}".format(learning_rate)) d("onpol_iters = {}".format(onpol_iters)) d("dynamics_iters = {}".format(dynamics_iters)) d("batch_size = {}".format(batch_size)) d("num_paths_random = {}".format(num_paths_random)) d("num_paths_onpol = {}".format(num_paths_onpol)) d("num_simulated_paths = {}".format(num_simulated_paths)) d("env_horizon = {}".format(env_horizon)) d("mpc_horizon = {}".format(mpc_horizon)) d("n_layers = {}".format(n_layers)) d("size = {}".format(size)) logz.configure_output_dir(logdir) #=========================================================================== # First, we need a lot of data generated by a random agent, with which # we'll begin to train our dynamics model. d("Generating random rollouts.") random_controller = RandomController(env) random_paths = sample(env=env, controller=random_controller, num_paths=num_paths_random, horizon=env_horizon, render=render) d("Done generating random rollouts.") #=========================================================================== # The random data will be used to get statistics (mean and std) for the # observations, actions, and deltas (where deltas are o_{t+1} - o_t). These # will be used for normalizing inputs and denormalizing outputs from the # dynamics network. d("Normalizing random rollouts.") data = paths_to_data(random_paths) normalization = compute_normalization(data) d("Done normalizing random rollouts.") mean_obs, std_obs, mean_deltas, std_deltas, mean_action, std_action = normalization d("mean_obs = {}".format(mean_obs)) d("std_obs = {}".format(std_obs)) d("mean_deltas = {}".format(mean_deltas)) d("std_deltas = {}".format(std_deltas)) d("mean_action = {}".format(mean_action)) d("std_action = {}".format(std_action)) #=========================================================================== # Build dynamics model and MPC controllers. 
sess = tf.Session() dyn_model = NNDynamicsModel(env=env, n_layers=n_layers, size=size, activation=activation, output_activation=output_activation, normalization=normalization, batch_size=batch_size, iterations=dynamics_iters, learning_rate=learning_rate, sess=sess) mpc_controller = MPCcontroller(env=env, dyn_model=dyn_model, horizon=mpc_horizon, cost_fn=cost_fn, num_simulated_paths=num_simulated_paths) #=========================================================================== # Tensorflow session building. sess.__enter__() tf.global_variables_initializer().run() #=========================================================================== # Take multiple iterations of onpolicy aggregation at each iteration # refitting the dynamics model to current dataset and then taking onpolicy # samples and aggregating to the dataset. # # Note: You don't need to use a mixing ratio in this assignment for new and # old data as described in https://arxiv.org/abs/1708.02596 start_time = time.time() for itr in range(onpol_iters): d("Iteration {}".format(itr)) # Shuffle data. d("Shuffling data.") shuffle_indexes = np.random.permutation(data["observations"].shape[0]) data["observations"] = data["observations"][shuffle_indexes] data["actions"] = data["actions"][shuffle_indexes] data["next_observations"] = data["next_observations"][shuffle_indexes] data["rewards"] = data["rewards"][shuffle_indexes] d("Done shuffling data.") # Fit the dynamics. d("Fitting dynamics.") dyn_model.fit(data) d("Done fitting dynamics.") # Generate on-policy rollouts. d("Generating on-policy rollouts.") rl_paths = sample(env=env, controller=mpc_controller, num_paths=num_paths_onpol, horizon=env_horizon, render=render) d("Done generating on-policy rollouts.") # Compute metrics. costs = np.array([path_cost(cost_fn, path) for path in rl_paths]) returns = np.array([sum(path["rewards"]) for path in rl_paths]) # Update data. new_data = paths_to_data(rl_paths) data = { "observations": np.concatenate([data["observations"], new_data["observations"]]), "actions": np.concatenate([data["actions"], new_data["actions"]]), "next_observations": np.concatenate( [data["next_observations"], new_data["next_observations"]]), "rewards": np.concatenate([data["rewards"], new_data["rewards"]]), } # TODO(mwhittaker): Shuffle if we need to. # LOGGING # Statistics for performance of MPC policy using our learned dynamics # model logz.log_tabular('Iteration', itr) logz.log_tabular('Time', time.time() - start_time) # In terms of cost function which your MPC controller uses to plan logz.log_tabular('AverageCost', np.mean(costs)) logz.log_tabular('StdCost', np.std(costs)) logz.log_tabular('MinimumCost', np.min(costs)) logz.log_tabular('MaximumCost', np.max(costs)) # In terms of true environment reward of your rolled out trajectory # using the MPC controller logz.log_tabular('AverageReturn', np.mean(returns)) logz.log_tabular('StdReturn', np.std(returns)) logz.log_tabular('MinimumReturn', np.min(returns)) logz.log_tabular('MaximumReturn', np.max(returns)) logz.dump_tabular()
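# Hedged sketch of the paths_to_data helper used above, assuming it simply stacks the per-path
# arrays into one flat dataset with the four keys that the shuffling and aggregation code above
# expects; the exact implementation in the original repo may differ.
import numpy as np

def paths_to_data(paths):
    return {
        "observations": np.concatenate([p["observations"] for p in paths]),
        "actions": np.concatenate([p["actions"] for p in paths]),
        "next_observations": np.concatenate([p["next_observations"] for p in paths]),
        "rewards": np.concatenate([p["rewards"] for p in paths]),
    }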
def train(env, cost_fn, logdir=None, render=False, learning_rate=1e-3, onpol_iters=10, dynamics_iters=20, batch_size=64, num_paths_random=10, num_paths_onpol=10, num_simulated_paths=10000, env_horizon=500, mpc_horizon=15, n_layers=2, size=64, activation=tf.nn.relu, output_activation=None, controller_service=None, ): """ Arguments: onpol_iters Number of iterations of onpolicy aggregation for the loop to run. dynamics_iters Number of iterations of training for the dynamics model |_ which happen per iteration of the aggregation loop. batch_size Batch size for dynamics training. num_paths_random Number of paths/trajectories/rollouts generated | by a random agent. We use these to train our |_ initial dynamics model. num_paths_onpol Number of paths to collect at each iteration of |_ aggregation, using the Model Predictive Control policy. num_simulated_paths How many fictitious rollouts the MPC policy | should generate each time it is asked for an |_ action. env_horizon Number of timesteps in each path. mpc_horizon The MPC policy generates actions by imagining | fictitious rollouts, and picking the first action | of the best fictitious rollout. This argument is | how many timesteps should be in each fictitious |_ rollout. n_layers/size/activations Neural network architecture arguments. """ logz.configure_output_dir(logdir) #======================================================== # # First, we need a lot of data generated by a random # agent, with which we'll begin to train our dynamics # model. random_controller = RandomController(env) ref_controller = RefMPCController(env, lambda state: call_mpc(env, controller_service)) paths = sample(env, random_controller, num_paths=num_paths_random, horizon=env_horizon, render=False, verbose=False, ) #======================================================== # # The random data will be used to get statistics (mean # and std) for the observations, actions, and deltas # (where deltas are o_{t+1} - o_t). These will be used # for normalizing inputs and denormalizing outputs # from the dynamics network. # normalization = compute_normalization(paths) print(normalization) #======================================================== # # Build dynamics model and MPC controllers. # sess = tf.Session() dyn_model = NNDynamicsModel(env=env, n_layers=n_layers, size=size, activation=activation, output_activation=output_activation, normalization=normalization, batch_size=batch_size, iterations=dynamics_iters, learning_rate=learning_rate, sess=sess) mpc_controller = MPCcontroller(env=env, dyn_model=dyn_model, horizon=mpc_horizon, cost_fn=cost_fn, num_simulated_paths=num_simulated_paths) #======================================================== # # Tensorflow session building. # sess.__enter__() tf.global_variables_initializer().run() #======================================================== # # Take multiple iterations of onpolicy aggregation at each iteration refitting the dynamics model to current dataset and then taking onpolicy samples and aggregating to the dataset. 
# Note: You don't need to use a mixing ratio in this assignment for new and old data as described in https://arxiv.org/abs/1708.02596 # for itr in range(onpol_iters): dyn_model.fit(paths) new_paths = sample(env,mpc_controller, num_paths=num_paths_onpol,horizon=env_horizon,render=False,verbose=False) costs = [] returns = [] for new_path in new_paths: cost = path_cost(cost_fn, new_path) costs.append(cost) returns.append(new_path['return']) costs = np.array(costs) returns = np.array(returns) paths = paths + new_paths # Aggregation # LOGGING # Statistics for performance of MPC policy using # our learned dynamics model logz.log_tabular('Iteration', itr) # In terms of cost function which your MPC controller uses to plan logz.log_tabular('AverageCost', np.mean(costs)) logz.log_tabular('StdCost', np.std(costs)) logz.log_tabular('MinimumCost', np.min(costs)) logz.log_tabular('MaximumCost', np.max(costs)) # In terms of true environment reward of your rolled out trajectory using the MPC controller logz.log_tabular('AverageReturn', np.mean(returns)) logz.log_tabular('StdReturn', np.std(returns)) logz.log_tabular('MinimumReturn', np.min(returns)) logz.log_tabular('MaximumReturn', np.max(returns)) logz.dump_tabular()
def train(state_cb, pub_cmd, pub_act, rate, cost_fn, logdir=None, render=False, learning_rate=1e-3, onpol_iters=10, dynamics_iters=60, batch_size=512, num_paths_random=10, num_paths_onpol=10, num_simulated_paths=10000, env_horizon=1000, mpc_horizon=15, n_layers=2, size=500, activation=tf.nn.relu, output_activation=None): """ Arguments: onpol_iters Number of iterations of onpolicy aggregation for the loop to run. dynamics_iters Number of iterations of training for the dynamics model |_ which happen per iteration of the aggregation loop. batch_size Batch size for dynamics training. num_paths_random Number of paths/trajectories/rollouts generated | by a random agent. We use these to train our |_ initial dynamics model. num_paths_onpol Number of paths to collect at each iteration of |_ aggregation, using the Model Predictive Control policy. num_simulated_paths How many fictitious rollouts the MPC policy | should generate each time it is asked for an |_ action. env_horizon Number of timesteps in each path. mpc_horizon The MPC policy generates actions by imagining | fictitious rollouts, and picking the first action | of the best fictitious rollout. This argument is | how many timesteps should be in each fictitious |_ rollout. n_layers/size/activations Neural network architecture arguments. """ logz.configure_output_dir(logdir) #======================================================== # # First, we need a lot of data generated by a random # agent, with which we'll begin to train our dynamics # model. rand_controller = RandomController() paths = sample(state_cb, pub_cmd, pub_act, rate, rand_controller, num_paths_random, env_horizon, render) data = paths_to_array(paths) #======================================================== # # The random data will be used to get statistics (mean # and std) for the observations, actions, and deltas # (where deltas are o_{t+1} - o_t). These will be used # for normalizing inputs and denormalizing outputs # from the dynamics network. # normalization = compute_normalization(data) #======================================================== # # Build dynamics model and MPC controllers. # sess = tf.Session() dyn_model = NNDynamicsModel(n_layers=n_layers, size=size, activation=activation, output_activation=output_activation, normalization=normalization, batch_size=batch_size, iterations=dynamics_iters, learning_rate=learning_rate, sess=sess) mpc_controller = MPCcontroller(dyn_model=dyn_model, horizon=mpc_horizon, cost_fn=cost_fn, num_simulated_paths=num_simulated_paths) #======================================================== # # Tensorflow session building. # sess.__enter__() tf.global_variables_initializer().run() #======================================================== # # Take multiple iterations of onpolicy aggregation at each iteration refitting the dynamics model to current dataset and then taking onpolicy samples and aggregating to the dataset. 
# Note: You don't need to use a mixing ratio in this assignment for new and old data as described in https://arxiv.org/abs/1708.02596 # for itr in range(onpol_iters): # Fit dynamics model print('Training dynamics model...') dyn_model.fit(data) plot_comparison(dyn_model, state_cb, pub_act, pub_cmd, rate) mpc_controller.dyn_model = dyn_model costs = [] returns = [] # Do MPC for i in range(num_paths_onpol): print('On policy path: %i' % i) obs_t, obs_tp1, acs_t, rews_t = [], [], [], [] s_t = state_cb.reset(pub_act, pub_cmd) total_return = 0 for j in range(env_horizon): # print('Timestep: %i, Return: %g' % (j,total_return)) a_t = mpc_controller.get_action(s_t) s_tp1, _ = state_cb.step(a_t, pub_act, pub_cmd) r_t = 0 for i in range(9): r_t += s_tp1[i * 12] - s_t[i * 12] total_return += r_t if render: env.render() time.sleep(0.05) obs_t.append(s_t) obs_tp1.append(s_tp1) acs_t.append(a_t) rews_t.append(r_t) s_t = s_tp1 path = { "observations": np.array(obs_t), "next_observations": np.array(obs_tp1), "actions": np.array(acs_t), "rewards": np.array(rews_t) } total_cost = path_cost(cost_fn, path) paths.append(path) returns.append(total_return) costs.append(total_cost) print('Total cost: %g, Total reward: %g' % (total_cost, total_return)) data = paths_to_array(paths) normalization = compute_normalization(data) # Set new normalization statistics for dynamics model dyn_model.normalization = normalization # LOGGING # Statistics for performance of MPC policy using # our learned dynamics model logz.log_tabular('Iteration', itr) # In terms of cost function which your MPC controller uses to plan logz.log_tabular('AverageCost', np.mean(costs)) logz.log_tabular('StdCost', np.std(costs)) logz.log_tabular('MinimumCost', np.min(costs)) logz.log_tabular('MaximumCost', np.max(costs)) # In terms of true environment reward of your rolled out trajectory using the MPC controller logz.log_tabular('AverageReturn', np.mean(returns)) logz.log_tabular('StdReturn', np.std(returns)) logz.log_tabular('MinimumReturn', np.min(returns)) logz.log_tabular('MaximumReturn', np.max(returns)) logz.dump_tabular()
def train(env, cost_fn, logdir=None, render=False, learning_rate=1e-3, onpol_iters=10, dynamics_iters=60, batch_size=512, num_paths_random=10, num_paths_onpol=10, num_simulated_paths=10000, env_horizon=1000, mpc_horizon=10, n_layers=2, size=500, activation=tf.nn.relu, output_activation=None): """ Arguments: onpol_iters Number of iterations of onpolicy aggregation for the loop to run. dynamics_iters Number of iterations of training for the dynamics model |_ which happen per iteration of the aggregation loop. batch_size Batch size for dynamics training. num_paths_random Number of paths/trajectories/rollouts generated | by a random agent. We use these to train our |_ initial dynamics model. num_paths_onpol Number of paths to collect at each iteration of |_ aggregation, using the Model Predictive Control policy. num_simulated_paths How many fictitious rollouts the MPC policy | should generate each time it is asked for an |_ action. env_horizon Number of timesteps in each path. mpc_horizon The MPC policy generates actions by imagining | fictitious rollouts, and picking the first action | of the best fictitious rollout. This argument is | how many timesteps should be in each fictitious |_ rollout. n_layers/size/activations Neural network architecture arguments. """ logz.configure_output_dir(logdir) #======================================================== # # First, we need a lot of data generated by a random # agent, with which we'll begin to train our dynamics # model. random_controller = RandomController(env) """ YOUR CODE HERE """ paths = sample(env, random_controller, num_paths=50) first = 1 for path in paths: if (first): data = { "observations": path['observations'], "next_observations": path['next_observations'], "rewards": path['rewards'], "actions": path['actions'], "returns": path['returns'] } first = 0 else: data['observations'] = np.vstack( (data['observations'], path['observations'])) data['next_observations'] = np.vstack( (data['next_observations'], path['next_observations'])) data['rewards'] = np.vstack((data['rewards'], path['rewards'])) data['actions'] = np.vstack((data['actions'], path['actions'])) data['returns'] = np.vstack((data['returns'], path['returns'])) #======================================================== # # The random data will be used to get statistics (mean # and std) for the observations, actions, and deltas # (where deltas are o_{t+1} - o_t). These will be used # for normalizing inputs and denormalizing outputs # from the dynamics network. # """ YOUR CODE HERE """ normalization = compute_normalization(data) #======================================================== # # Build dynamics model and MPC controllers. # sess = tf.Session() dyn_model = NNDynamicsModel(env=env, n_layers=n_layers, size=size, activation=activation, output_activation=output_activation, normalization=normalization, batch_size=batch_size, iterations=dynamics_iters, learning_rate=learning_rate, sess=sess) mpc_controller = MPCcontroller(env=env, dyn_model=dyn_model, horizon=mpc_horizon, cost_fn=cost_fn, num_simulated_paths=num_simulated_paths) #======================================================== # # Tensorflow session building. # sess.__enter__() tf.global_variables_initializer().run() #======================================================== # # Take multiple iterations of onpolicy aggregation at each iteration refitting the dynamics model to current dataset and then taking onpolicy samples and aggregating to the dataset. 
# Note: You don't need to use a mixing ratio in this assignment for new and old data as described in https://arxiv.org/abs/1708.02596 # #open CSV csv_file = open('results.csv', 'w') writer = csv.writer(csv_file, delimiter=',') for itr in range(onpol_iters): print(itr) costs = [] returns = [] """ YOUR CODE HERE """ dyn_model.fit(data) #plot_comparison(env,dyn_model) mpc_controller.dyn_model = dyn_model #need to update or not? new_paths = sample(env, mpc_controller) for path in new_paths: cost = path_cost(cost_fn, path) costs.append(cost) returns.append(path['returns'][-1]) data['observations'] = np.vstack( (data['observations'], path['observations'])) data['next_observations'] = np.vstack( (data['next_observations'], path['next_observations'])) data['actions'] = np.vstack((data['actions'], path['actions'])) dyn_model.normalization = compute_normalization(data) writer.writerow([itr, np.mean(returns)]) # LOGGING # Statistics for performance of MPC policy using # our learned dynamics model logz.log_tabular('Iteration', itr) # In terms of cost function which your MPC controller uses to plan logz.log_tabular('AverageCost', np.mean(costs)) logz.log_tabular('StdCost', np.std(costs)) logz.log_tabular('MinimumCost', np.min(costs)) logz.log_tabular('MaximumCost', np.max(costs)) # In terms of true environment reward of your rolled out trajectory using the MPC controller logz.log_tabular('AverageReturn', np.mean(returns)) logz.log_tabular('StdReturn', np.std(returns)) logz.log_tabular('MinimumReturn', np.min(returns)) logz.log_tabular('MaximumReturn', np.max(returns)) logz.dump_tabular()
def train(env, cost_fn, logdir=None, render=False, learning_rate=1e-3, onpol_iters=10, dynamics_iters=60, batch_size=512, num_paths_random=10, num_paths_onpol=10, num_simulated_paths=10000, env_horizon=1000, mpc_horizon=15, n_layers=2, size=500, activation='relu', output_activation=None): """ Arguments: onpol_iters Number of iterations of onpolicy aggregation for the loop to run. dynamics_iters Number of iterations of training for the dynamics model |_ which happen per iteration of the aggregation loop. batch_size Batch size for dynamics training. num_paths_random Number of paths/trajectories/rollouts generated | by a random agent. We use these to train our |_ initial dynamics model. num_paths_onpol Number of paths to collect at each iteration of |_ aggregation, using the Model Predictive Control policy. num_simulated_paths How many fictitious rollouts the MPC policy | should generate each time it is asked for an |_ action. env_horizon Number of timesteps in each path. mpc_horizon The MPC policy generates actions by imagining | fictitious rollouts, and picking the first action | of the best fictitious rollout. This argument is | how many timesteps should be in each fictitious |_ rollout. n_layers/size/activations Neural network architecture arguments. """ logz.configure_output_dir(logdir) # ======================================================== # # First, we need a lot of data generated by a random # agent, with which we'll begin to train our dynamics # model. """ YOUR CODE HERE """ random_controller = RandomController(env) paths = sample(env, random_controller, num_paths=num_paths_random, horizon=env_horizon, ignore_done=True) # 10 # ======================================================== # # The random data will be used to get statistics (mean # and std) for the observations, actions, and deltas # (where deltas are o_{t+1} - o_t). These will be used # for normalizing inputs and denormalizing outputs # from the dynamics network. # """ YOUR CODE HERE """ # concatenate observations & actions to numpy data_rand_x # concatenate (next_observations -observations) to numpy data_rand_y for i in range(num_paths_random): if i == 0: data_rand_x = np.concatenate( (paths[i]['observations'], paths[i]['actions']), axis=1) data_rand_y = paths[i]['next_observations'] - paths[i][ 'observations'] else: x = np.concatenate((paths[i]['observations'], paths[i]['actions']), axis=1) data_rand_x = np.concatenate((data_rand_x, x), axis=0) y = paths[i]['next_observations'] - paths[i]['observations'] data_rand_y = np.concatenate((data_rand_y, y), axis=0) # Initialize data set D to Drand data_x = data_rand_x data_y = data_rand_y # ======================================================== # # Build dynamics model and MPC controllers. # # sess = tf.Session() # dyn_model = NNDynamicsModel(env=env, # n_layers=n_layers, # size=size, # activation=activation, # output_activation=output_activation, # batch_size=batch_size, # iterations=dynamics_iters, # learning_rate=learning_rate, # normalization=normalization # ) dyn_model = NNDynamicsModel( env=env, hidden_size=(500, 500), activation=activation, #'tanh' ).cuda() mpc_controller = MPCcontroller( env=env, dyn_model=dyn_model, horizon=mpc_horizon, cost_fn=cost_fn, num_simulated_paths=num_simulated_paths, ) # ======================================================== # # Tensorflow session building. 
# # sess.__enter__() # tf.global_variables_initializer().run() # ======================================================== # # Take multiple iterations of onpolicy aggregation at each iteration refitting the dynamics model to current dataset and then taking onpolicy samples and aggregating to the dataset. # Note: You don't need to use a mixing ratio in this assignment for new and old data as described in https://arxiv.org/abs/1708.02596 # # make dirs output if not (os.path.exists(logdir)): os.makedirs(logdir) path = os.path.join(logdir, 'model') if not (os.path.exists(path)): os.makedirs(path) for itr in range(onpol_iters): """ YOUR CODE HERE """ if itr != 0: dyn_model.load_state_dict(torch.load(path + '/net_params.pkl')) # store data # if (itr % 9) == 0 or itr == (onpol_iters-1): if itr >= 0: logger = Logger(logdir, csvname='log_orig' + str(itr)) data = np.concatenate((data_x, data_y), axis=1) logger.log_table2csv(data) if itr == 0: data_x += np.random.normal(0, 0.001, size=data_x.shape) data_y += np.random.normal(0, 0.001, size=data_y.shape) else: data_x = best_x + np.random.normal(0, 0.001, size=best_x.shape) data_y = best_y + np.random.normal(0, 0.001, size=best_y.shape) dyn_model.fit(data_x, data_y, epoch_size=dynamics_iters, batch_size=batch_size, test=True) torch.save(dyn_model.state_dict(), path + '/net_params.pkl') # save only the parameters torch.save(dyn_model, path + '/net' + str(itr) + '.pkl') # save entire net print('-------------Itr %d-------------' % itr) print('Start time:\n') print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))) start = time.time() # caculate run time --start time point # sample if Monitor is True: monitor_path = os.path.join(logdir, 'monitor' + str(itr)) env = wrappers.Monitor(env, monitor_path, force=True) paths = sample(env, mpc_controller, num_paths=num_paths_onpol, horizon=env_horizon, render=False, ignore_done=False, MPC=True) end = time.time() runtime2 = end - start print('runtime = ', runtime2) print('End time:\n') print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))) # concatenate observations & actions to numpy data_rand_x # concatenate (next_observations -observations) to numpy data_rand_y for i in range(num_paths_onpol): if i == 0: data_rl_x = np.concatenate( (paths[i]['observations'], paths[i]['actions']), axis=1) data_rl_y = paths[i]['next_observations'] - paths[i][ 'observations'] else: x = np.concatenate( (paths[i]['observations'], paths[i]['actions']), axis=1) data_rl_x = np.concatenate((data_rl_x, x), axis=0) y = paths[i]['next_observations'] - paths[i]['observations'] data_rl_y = np.concatenate((data_rl_y, y), axis=0) # Aggregate data data_x = np.concatenate((data_x, data_rl_x), axis=0) data_y = np.concatenate((data_y, data_rl_y), axis=0) costs = np.zeros((num_paths_onpol, 1)) returns = np.zeros((num_paths_onpol, 1)) for i in range(num_paths_onpol): costs[i] = paths[i]['cost'] returns[i] = paths[i]['returns'][0] if itr == 0: best_x = data_rl_x best_y = data_rl_y else: best_x = np.concatenate((best_x, data_rl_x), axis=0) best_y = np.concatenate((best_y, data_rl_y), axis=0) # store data #if (itr % 9) == 0 or itr == (onpol_iters-1): if itr >= 0: logger = Logger(logdir, csvname='best' + str(itr)) data = np.concatenate((best_x, best_y), axis=1) logger.log_table2csv(data) # LOGGING # Statistics for performance of MPC policy using # our learned dynamics model logz.log_tabular('Iteration', itr) # In terms of cost function which your MPC controller uses to plan logz.log_tabular('AverageCost', np.mean(costs)) 
logz.log_tabular('StdCost', np.std(costs)) logz.log_tabular('MinimumCost', np.min(costs)) logz.log_tabular('MaximumCost', np.max(costs)) # In terms of true environment reward of your rolled out trajectory using the MPC controller logz.log_tabular('AverageReturn', np.mean(returns)) logz.log_tabular('StdReturn', np.std(returns)) logz.log_tabular('MinimumReturn', np.min(returns)) logz.log_tabular('MaximumReturn', np.max(returns)) logz.dump_tabular()
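# Hedged sketch of the kind of PyTorch dynamics model the variant above assumes: an MLP mapping
# concatenated (observation, action) rows in data_x to state deltas in data_y, trained with a
# plain MSE loop, and save/load-able via state_dict as above. Layer sizes, the optimizer, and
# the fit signature are assumptions chosen to resemble how the code above calls it.
import torch
import torch.nn as nn

class TorchDynamicsModel(nn.Module):
    def __init__(self, in_dim, out_dim, hidden_size=(500, 500)):
        super().__init__()
        h1, h2 = hidden_size
        self.net = nn.Sequential(
            nn.Linear(in_dim, h1), nn.ReLU(),
            nn.Linear(h1, h2), nn.ReLU(),
            nn.Linear(h2, out_dim),
        )

    def forward(self, x):
        return self.net(x)

    def fit(self, data_x, data_y, epoch_size=60, batch_size=512, lr=1e-3):
        # Simple minibatch MSE training over the aggregated (obs+action -> delta) dataset.
        opt = torch.optim.Adam(self.parameters(), lr=lr)
        x = torch.as_tensor(data_x, dtype=torch.float32)
        y = torch.as_tensor(data_y, dtype=torch.float32)
        n = x.shape[0]
        for _ in range(epoch_size):
            perm = torch.randperm(n)
            for start in range(0, n, batch_size):
                idx = perm[start:start + batch_size]
                loss = nn.functional.mse_loss(self(x[idx]), y[idx])
                opt.zero_grad()
                loss.backward()
                opt.step()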
def train_PG( exp_name='', env_name='', n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, max_path_length=None, learning_rate=5e-3, reward_to_go=False, animate=True, logdir=None, normalize_advantages=False, nn_baseline=False, seed=0, # network arguments n_layers=1, size=32, # mb mpc arguments model_learning_rate=1e-3, onpol_iters=10, dynamics_iters=260, batch_size=512, num_paths_random=10, num_paths_onpol=10, num_simulated_paths=1000, env_horizon=1000, mpc_horizon=10, m_n_layers=2, m_size=500, ): start = time.time() # Configure output directory for logging logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train_PG)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params) # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) # Make the gym environment # env = gym.make(env_name) env = HalfCheetahEnvNew() cost_fn = cheetah_cost_fn activation=tf.nn.relu output_activation=None # Is this env continuous, or discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Maximum length for episodes # max_path_length = max_path_length or env.spec.max_episode_steps max_path_length = max_path_length # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] # Print environment information print("-------- env info --------") print("Environment name: ", env_name) print("Action space is discrete: ", discrete) print("Action space dim: ", ac_dim) print("Observation space dim: ", ob_dim) print("Max_path_length ", max_path_length) #========================================================================================# # Random data collection #========================================================================================# random_controller = RandomController(env) data_buffer_model = DataBuffer() data_buffer_ppo = DataBuffer_general(10000, 4) # sample path print("collecting random data ..... 
") paths = sample(env, random_controller, num_paths=num_paths_random, horizon=env_horizon, render=False, verbose=False) # add into buffer for path in paths: for n in range(len(path['observations'])): data_buffer_model.add(path['observations'][n], path['actions'][n], path['next_observations'][n]) print("data buffer size: ", data_buffer_model.size) normalization = compute_normalization(data_buffer_model) #========================================================================================# # Tensorflow Engineering: Config, Session, Variable initialization #========================================================================================# tf_config = tf.ConfigProto() tf_config.allow_soft_placement = True tf_config.intra_op_parallelism_threads =4 tf_config.inter_op_parallelism_threads = 1 sess = tf.Session(config=tf_config) dyn_model = NNDynamicsModel(env=env, n_layers=n_layers, size=size, activation=activation, output_activation=output_activation, normalization=normalization, batch_size=batch_size, iterations=dynamics_iters, learning_rate=learning_rate, sess=sess) mpc_controller = MPCcontroller(env=env, dyn_model=dyn_model, horizon=mpc_horizon, cost_fn=cost_fn, num_simulated_paths=num_simulated_paths) policy_nn = policy_network_ppo(sess, ob_dim, ac_dim, discrete, n_layers, size, learning_rate) if nn_baseline: value_nn = value_network(sess, ob_dim, n_layers, size, learning_rate) sess.__enter__() # equivalent to `with sess:` tf.global_variables_initializer().run() #========================================================================================# # Training Loop #========================================================================================# total_timesteps = 0 for itr in range(n_iter): print("********** Iteration %i ************"%itr) if MPC: dyn_model.fit(data_buffer_model) returns = [] costs = [] # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: # print("data buffer size: ", data_buffer_model.size) current_path = {'observations': [], 'actions': [], 'reward': [], 'next_observations':[]} ob = env.reset() obs, acs, mpc_acs, rewards = [], [], [], [] animate_this_episode=(len(paths)==0 and (itr % 10 == 0) and animate) steps = 0 return_ = 0 while True: # print("steps ", steps) if animate_this_episode: env.render() time.sleep(0.05) obs.append(ob) if MPC: mpc_ac = mpc_controller.get_action(ob) else: mpc_ac = random_controller.get_action(ob) ac = policy_nn.predict(ob, mpc_ac) ac = ac[0] if not PG: ac = mpc_ac acs.append(ac) mpc_acs.append(mpc_ac) current_path['observations'].append(ob) ob, rew, done, _ = env.step(ac) current_path['reward'].append(rew) current_path['actions'].append(ac) current_path['next_observations'].append(ob) return_ += rew rewards.append(rew) steps += 1 if done or steps > max_path_length: break if MPC: # cost & return cost = path_cost(cost_fn, current_path) costs.append(cost) returns.append(return_) print("total return: ", return_) print("costs: ", cost) # add into buffers for n in range(len(current_path['observations'])): data_buffer_model.add(current_path['observations'][n], current_path['actions'][n], current_path['next_observations'][n]) for n in range(len(current_path['observations'])): data_buffer_ppo.add(current_path['observations'][n], current_path['actions'][n], current_path['reward'][n], current_path['next_observations'][n]) path = {"observation" : np.array(obs), "reward" : np.array(rewards), "action" : np.array(acs), "mpc_action" : np.array(mpc_acs)} paths.append(path) timesteps_this_batch += 
pathlength(path) # print("timesteps_this_batch", timesteps_this_batch) if timesteps_this_batch > min_timesteps_per_batch: break total_timesteps += timesteps_this_batch print("data_buffer_ppo.size:", data_buffer_ppo.size) # Build arrays for observation, action for the policy gradient update by concatenating # across paths ob_no = np.concatenate([path["observation"] for path in paths]) ac_na = np.concatenate([path["action"] for path in paths]) mpc_ac_na = np.concatenate([path["mpc_action"] for path in paths]) # Computing Q-values if reward_to_go: q_n = [] for path in paths: for t in range(len(path["reward"])): t_ = 0 q = 0 while t_ < len(path["reward"]): if t_ >= t: q += gamma**(t_-t) * path["reward"][t_] t_ += 1 q_n.append(q) q_n = np.asarray(q_n) else: q_n = [] for path in paths: for t in range(len(path["reward"])): t_ = 0 q = 0 while t_ < len(path["reward"]): q += gamma**t_ * path["reward"][t_] t_ += 1 q_n.append(q) q_n = np.asarray(q_n) # Computing Baselines if nn_baseline: # b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no :ob_no}) b_n = value_nn.predict(ob_no) b_n = normalize(b_n) b_n = denormalize(b_n, np.std(q_n), np.mean(q_n)) adv_n = q_n - b_n else: adv_n = q_n.copy() # Advantage Normalization if normalize_advantages: adv_n = normalize(adv_n) # Optimizing Neural Network Baseline if nn_baseline: b_n_target = normalize(q_n) value_nn.fit(ob_no, b_n_target) # sess.run(baseline_update_op, feed_dict={sy_ob_no :ob_no, sy_baseline_target_n:b_n_target}) # Performing the Policy Update # policy_nn.fit(ob_no, ac_na, adv_n) policy_nn.fit(ob_no, ac_na, adv_n, mpc_ac_na) # sess.run(update_op, feed_dict={sy_ob_no :ob_no, sy_ac_na:ac_na, sy_adv_n:adv_n}) # Log diagnostics returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.dump_tabular() logz.pickle_tf_vars()
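# Hedged sketch: the reward-to-go computation above rescans the whole path for every timestep
# (quadratic in path length); the same quantity can be computed in one backward pass. `rewards`
# is a 1-D array of per-step rewards for a single path; this helper is illustrative only.
import numpy as np

def reward_to_go(rewards, gamma):
    q = np.zeros(len(rewards), dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running  # q[t] = r_t + gamma * q[t+1]
        q[t] = running
    return q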
dyn_model = NNDynamicsModel(env=env, n_layers=n_layers, size=size, activation=activation, output_activation=output_activation, normalization=normalization, batch_size=batch_size, iterations=dynamics_iters, learning_rate=learning_rate_dyn, sess=sess) policy = NNPolicy(env=env, normalization=normalization, batch_size=batch_size, iterations=policy_iters, learning_rate=learning_rate_policy, sess=sess) mpc_controller = MPCcontroller(env=env, dyn_model=dyn_model, horizon=mpc_horizon, cost_fn=cost_fn, num_simulated_paths=num_simulated_paths) #======================================================== # # Tensorflow session building. # sess.__enter__() tf.global_variables_initializer().run() #======================================================== # # Take multiple iterations of onpolicy aggregation at each iteration refitting the dynamics model to current dataset and then taking onpolicy samples and aggregating to the dataset. # Note: You don't need to use a mixing ratio in this assignment for new and old data as described in https://arxiv.org/abs/1708.02596
def train(env, logdir=None, render=False, learning_rate=1e-3, dagger_iters=10, dynamics_iters=60, batch_size=512, num_random_rollouts=10, num_onpol_rollouts=10, num_simulated_paths=10000, env_horizon=1000, mpc_horizon=15, n_layers=2, n_hid_units=500, activation=tf.nn.relu, output_activation=None): """ Arguments: dagger_iters Number of iterations of onpolicy aggregation for the loop to run. dyn_iters Number of iterations of training for the dynamics model |_ which happen per iteration of the aggregation loop. batch_size Batch size for dynamics training. num_random_rollouts Number of paths/trajectories/rollouts generated | by a random agent. We use these to train our |_ initial dynamics model. num_onpol_rollouts Number of paths to collect at each iteration of |_ aggregation, using the Model Predictive Control policy. num_simulated_paths How many fictitious rollouts the MPC policy | should generate each time it is asked for an |_ action. env_horizon Number of timesteps in each path. mpc_horizon The MPC policy generates actions by imagining | fictitious rollouts, and picking the first action | of the best fictitious rollout. This argument is | how many timesteps should be in each fictitious |_ rollout. n_layers/n_hid_units/activations Neural network architecture arguments. """ logz.configure_output_dir(logdir) #======================================================== # # First, we need a lot of data generated by a random # agent, with which we'll begin to train our dynamics # model. random_controller = RandomController(env) paths = sample(env, random_controller, num_rollouts=num_random_rollouts, horizon=env_horizon) #======================================================== # # The random data will be used to get statistics (mean # and std) for the observations, actions, and deltas # (where deltas are o_{t+1} - o_t). These will be used # for normalizing inputs and denormalizing outputs # from the dynamics network. normalization_stats = compute_normalization_stats(paths) #======================================================== # # Build dynamics model and MPC controllers. # sess = tf.Session() dyn_model = NNDynamicsModel(env=env, n_layers=n_layers, n_hid_units=n_hid_units, activation=activation, output_activation=output_activation, normalization_stats=normalization_stats, batch_size=batch_size, num_iter=dynamics_iters, learning_rate=learning_rate, sess=sess) mpc_controller = MPCcontroller(env=env, dyn_model=dyn_model, horizon=mpc_horizon, num_simulated_paths=num_simulated_paths) #======================================================== # # Tensorflow session building. # sess.__enter__() tf.global_variables_initializer().run() #======================================================== # # Take multiple iterations of onpolicy aggregation # at each iteration refitting the dynamics model to current dataset # and then taking on-policy samples and aggregating to the dataset. 
# # Note: You don't need to use a mixing ratio in this assignment # for new and old data as described in https://arxiv.org/abs/1708.02596 # for i in range(dagger_iters): print('********** ITERATION {}/{} ************'.format( i + 1, dagger_iters)) # Fitting dynamics model dyn_model.fit(paths) # Sampling on-policy new_paths = sample(env, mpc_controller, num_rollouts=num_onpol_rollouts, horizon=env_horizon) paths = new_paths + random.sample( paths, len(new_paths) // 9) # Adding new paths and forgetting old ones # paths += new_paths returns = [sum(path['rewards']) for path in new_paths] costs = [path_cost(path) for path in new_paths] # LOGGING # Statistics for performance of MPC policy using our learned dynamics model # In terms of cost function which your MPC controller uses to plan logz.log_tabular('AverageCost', np.mean(costs)) logz.log_tabular('StdCost', np.std(costs)) logz.log_tabular('MinimumCost', np.min(costs)) logz.log_tabular('MaximumCost', np.max(costs)) # In terms of true environment reward of your rolled out trajectory using the MPC controller logz.log_tabular('AverageReturn', np.mean(returns)) logz.log_tabular('StdReturn', np.std(returns)) logz.log_tabular('MinimumReturn', np.min(returns)) logz.log_tabular('MaximumReturn', np.max(returns)) logz.dump_tabular()