def plot_comparison(dyn_model, state_cb, pub_act, pub_cmd, rate):
    """
    Generate plots comparing the model predictions for each element of the
    state to the actual ground truth, using randomly sampled actions.
    """
    print('Plotting nn dynamics results')
    rand_cont = RandomController()
    s = state_cb.reset(pub_act, pub_cmd)
    env_state_traj = s
    model_state_traj = s
    steps = 100

    for i in range(steps):
        a = rand_cont.get_action(None)

        # Step environment
        env_s, _ = state_cb.step(a, pub_act, pub_cmd)
        env_state_traj = np.vstack((env_state_traj, env_s))

        # Step model (feed the model its own previous prediction back in)
        if i == 0:
            model_s = dyn_model.predict(model_state_traj, a)
        else:
            model_s = dyn_model.predict(model_state_traj[i, :], a)
        model_state_traj = np.vstack((model_state_traj, model_s))

    body = 10
    # for i in range(body * 12, (body + 1) * 12):
    for i in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 108, 109, 110]:
        plt.figure()
        plt.plot(np.arange(steps + 1),
                 env_state_traj[:, i].reshape((steps + 1)),
                 label='env state')
        plt.plot(np.arange(steps + 1),
                 model_state_traj[:, i].reshape((steps + 1)),
                 label='model state')

        state = i % 12
        if state == 0:
            plt.title('Body ' + str(body) + ', x position')
        elif state == 1:
            plt.title('Body ' + str(body) + ', y position')
        elif state == 2:
            plt.title('Body ' + str(body) + ', z position')
        elif state == 3:
            plt.title('Body ' + str(body) + ', x angle')
        elif state == 4:
            plt.title('Body ' + str(body) + ', y angle')
        elif state == 5:
            plt.title('Body ' + str(body) + ', z angle')
        elif state == 6:
            plt.title('Body ' + str(body) + ', x velocity')
        elif state == 7:
            plt.title('Body ' + str(body) + ', y velocity')
        elif state == 8:
            plt.title('Body ' + str(body) + ', z velocity')
        elif state == 9:
            plt.title('Body ' + str(body) + ', x angular velocity')
        elif state == 10:
            plt.title('Body ' + str(body) + ', y angular velocity')
        elif state == 11:
            plt.title('Body ' + str(body) + ', z angular velocity')

        plt.legend()
        plt.draw()
    plt.show()

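
# The title selection above assumes a 12-dimensional per-body state layout
# (x/y/z position, x/y/z angle, x/y/z velocity, x/y/z angular velocity).
# A minimal sketch of the same labeling via a lookup table instead of the
# if/elif chain -- `state_label` is an illustrative helper, not part of the
# original code:
def state_label(body, i):
    """Return a plot title for state index i of the given body (sketch)."""
    names = ['x position', 'y position', 'z position',
             'x angle', 'y angle', 'z angle',
             'x velocity', 'y velocity', 'z velocity',
             'x angular velocity', 'y angular velocity', 'z angular velocity']
    return 'Body ' + str(body) + ', ' + names[i % 12]
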
def plot_comparison(dyn_model, pub_act, pub_cmd, rate):
    """
    Generate plots comparing the model predictions for each element of the
    state to the actual ground truth, using randomly sampled actions.
    """
    rand_cont = RandomController()
    s = reset(pub_cmd, rate)
    env_state_traj = s
    model_state_traj = s
    steps = 100

    for i in range(steps):
        a = rand_cont.get_action(None)

        # Step environment
        env_s, _ = step(a, pub_act, pub_cmd, rate)
        env_state_traj = np.vstack((env_state_traj, env_s))

        # Step model (feed the model its own previous prediction back in)
        if i == 0:
            model_s = dyn_model.predict(model_state_traj, a)
        else:
            model_s = dyn_model.predict(model_state_traj[i, :], a)
        model_state_traj = np.vstack((model_state_traj, model_s))

    for i in range(len(s)):
        plt.figure()
        plt.plot(np.arange(steps + 1),
                 env_state_traj[:, i].reshape((steps + 1)),
                 label='env state')
        plt.plot(np.arange(steps + 1),
                 model_state_traj[:, i].reshape((steps + 1)),
                 label='model state')
        plt.title('State ' + str(i))
        plt.legend()
        plt.draw()
    plt.show()

def plot_comparison(env, dyn_model):
    """
    Generate plots comparing the model predictions for each element of the
    state to the actual ground truth, using randomly sampled actions.
    """
    horizon = 100
    ob = env.reset()
    pred = ob[np.newaxis, :]
    obs, next_obs, acs, rewards = [], [], [], []
    preds = []
    steps = 0
    RC = RandomController(env)

    for _ in range(horizon):
        obs.append(ob)
        preds.append(pred)
        ac = RC.get_action(ob)
        acs.append(ac)
        ob, rew, done, _ = env.step(ac)
        # Model prediction is rolled forward open-loop from its own output
        pred = dyn_model.predict(pred, ac[np.newaxis, :])
        next_obs.append(ob)
        rewards.append(rew)
        steps += 1
        if done or steps > horizon:
            break

    path = {"observations": np.array(obs),
            "next_observations": np.array(next_obs),
            "rewards": np.array(rewards),
            "actions": np.array(acs),
            "predictions": np.array(preds)}

    print(path['observations'].shape)
    print(path['predictions'].shape)

    plt.plot(path['observations'][:, 0])
    plt.plot(path['predictions'][:, 0, 0])
    plt.show()

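
# All three plot_comparison variants above share the same evaluation pattern:
# the learned dynamics model is rolled forward open-loop, feeding its own
# predictions back in, while the ground truth comes from stepping the real
# system with the same randomly sampled actions. A minimal sketch of that
# shared model rollout, assuming dyn_model.predict maps a (1, obs_dim) state
# and (1, ac_dim) action to the predicted next state and that the controller
# returns NumPy actions -- `open_loop_rollout` is an illustrative helper, not
# part of the original code:
def open_loop_rollout(dyn_model, controller, init_ob, horizon=100):
    """Roll the model forward for `horizon` steps from init_ob (sketch)."""
    pred = init_ob[np.newaxis, :]
    preds, actions = [pred], []
    for _ in range(horizon):
        ac = controller.get_action(pred[0])
        pred = dyn_model.predict(pred, ac[np.newaxis, :])
        preds.append(pred)
        actions.append(ac)
    # Shapes: (horizon + 1, obs_dim) predictions, (horizon, ac_dim) actions
    return np.concatenate(preds, axis=0), np.array(actions)
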
def train_PG(
        exp_name='',
        env_name='',
        n_iter=100,
        gamma=1.0,
        min_timesteps_per_batch=1000,
        max_path_length=None,
        learning_rate=5e-3,
        reward_to_go=False,
        animate=True,
        logdir=None,
        normalize_advantages=False,
        nn_baseline=False,
        seed=0,
        # network arguments
        n_layers=1,
        size=32,
        # mb mpc arguments
        model_learning_rate=1e-3,
        onpol_iters=10,
        dynamics_iters=260,
        batch_size=512,
        num_paths_random=10,
        num_paths_onpol=10,
        num_simulated_paths=1000,
        env_horizon=1000,
        mpc_horizon=10,
        m_n_layers=2,
        m_size=500,
        ):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    # env = gym.make(env_name)
    env = HalfCheetahEnvNew()
    cost_fn = cheetah_cost_fn
    activation = tf.nn.relu
    output_activation = None

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    # max_path_length = max_path_length or env.spec.max_episode_steps
    max_path_length = max_path_length

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    # Print environment information
    print("-------- env info --------")
    print("Environment name: ", env_name)
    print("Action space is discrete: ", discrete)
    print("Action space dim: ", ac_dim)
    print("Observation space dim: ", ob_dim)
    print("Max_path_length ", max_path_length)

    #========================================================================================#
    # Random data collection
    #========================================================================================#

    random_controller = RandomController(env)
    data_buffer_model = DataBuffer()
    data_buffer_ppo = DataBuffer_general(10000, 4)

    # sample paths with the random controller
    print("collecting random data .....")
") paths = sample(env, random_controller, num_paths=num_paths_random, horizon=env_horizon, render=False, verbose=False) # add into buffer for path in paths: for n in range(len(path['observations'])): data_buffer_model.add(path['observations'][n], path['actions'][n], path['next_observations'][n]) print("data buffer size: ", data_buffer_model.size) normalization = compute_normalization(data_buffer_model) #========================================================================================# # Tensorflow Engineering: Config, Session, Variable initialization #========================================================================================# tf_config = tf.ConfigProto() tf_config.allow_soft_placement = True tf_config.intra_op_parallelism_threads =4 tf_config.inter_op_parallelism_threads = 1 sess = tf.Session(config=tf_config) dyn_model = NNDynamicsModel(env=env, n_layers=n_layers, size=size, activation=activation, output_activation=output_activation, normalization=normalization, batch_size=batch_size, iterations=dynamics_iters, learning_rate=learning_rate, sess=sess) mpc_controller = MPCcontroller(env=env, dyn_model=dyn_model, horizon=mpc_horizon, cost_fn=cost_fn, num_simulated_paths=num_simulated_paths) policy_nn = policy_network_ppo(sess, ob_dim, ac_dim, discrete, n_layers, size, learning_rate) if nn_baseline: value_nn = value_network(sess, ob_dim, n_layers, size, learning_rate) sess.__enter__() # equivalent to `with sess:` tf.global_variables_initializer().run() #========================================================================================# # Training Loop #========================================================================================# total_timesteps = 0 for itr in range(n_iter): print("********** Iteration %i ************"%itr) if MPC: dyn_model.fit(data_buffer_model) returns = [] costs = [] # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: # print("data buffer size: ", data_buffer_model.size) current_path = {'observations': [], 'actions': [], 'reward': [], 'next_observations':[]} ob = env.reset() obs, acs, mpc_acs, rewards = [], [], [], [] animate_this_episode=(len(paths)==0 and (itr % 10 == 0) and animate) steps = 0 return_ = 0 while True: # print("steps ", steps) if animate_this_episode: env.render() time.sleep(0.05) obs.append(ob) if MPC: mpc_ac = mpc_controller.get_action(ob) else: mpc_ac = random_controller.get_action(ob) ac = policy_nn.predict(ob, mpc_ac) ac = ac[0] if not PG: ac = mpc_ac acs.append(ac) mpc_acs.append(mpc_ac) current_path['observations'].append(ob) ob, rew, done, _ = env.step(ac) current_path['reward'].append(rew) current_path['actions'].append(ac) current_path['next_observations'].append(ob) return_ += rew rewards.append(rew) steps += 1 if done or steps > max_path_length: break if MPC: # cost & return cost = path_cost(cost_fn, current_path) costs.append(cost) returns.append(return_) print("total return: ", return_) print("costs: ", cost) # add into buffers for n in range(len(current_path['observations'])): data_buffer_model.add(current_path['observations'][n], current_path['actions'][n], current_path['next_observations'][n]) for n in range(len(current_path['observations'])): data_buffer_ppo.add(current_path['observations'][n], current_path['actions'][n], current_path['reward'][n], current_path['next_observations'][n]) path = {"observation" : np.array(obs), "reward" : np.array(rewards), "action" : np.array(acs), "mpc_action" : np.array(mpc_acs)} paths.append(path) timesteps_this_batch += 
            # print("timesteps_this_batch", timesteps_this_batch)
            if timesteps_this_batch > min_timesteps_per_batch:
                break

        total_timesteps += timesteps_this_batch

        print("data_buffer_ppo.size:", data_buffer_ppo.size)

        # Build arrays for observation, action for the policy gradient update
        # by concatenating across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        mpc_ac_na = np.concatenate([path["mpc_action"] for path in paths])

        # Computing Q-values
        if reward_to_go:
            q_n = []
            for path in paths:
                for t in range(len(path["reward"])):
                    t_ = 0
                    q = 0
                    while t_ < len(path["reward"]):
                        if t_ >= t:
                            q += gamma**(t_ - t) * path["reward"][t_]
                        t_ += 1
                    q_n.append(q)
            q_n = np.asarray(q_n)
        else:
            q_n = []
            for path in paths:
                for t in range(len(path["reward"])):
                    t_ = 0
                    q = 0
                    while t_ < len(path["reward"]):
                        q += gamma**t_ * path["reward"][t_]
                        t_ += 1
                    q_n.append(q)
            q_n = np.asarray(q_n)

        # Computing Baselines
        if nn_baseline:
            # b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no: ob_no})
            b_n = value_nn.predict(ob_no)
            b_n = normalize(b_n)
            b_n = denormalize(b_n, np.std(q_n), np.mean(q_n))
            adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()

        # Advantage Normalization
        if normalize_advantages:
            adv_n = normalize(adv_n)

        # Optimizing Neural Network Baseline
        if nn_baseline:
            b_n_target = normalize(q_n)
            value_nn.fit(ob_no, b_n_target)
            # sess.run(baseline_update_op, feed_dict={sy_ob_no: ob_no, sy_baseline_target_n: b_n_target})

        # Performing the Policy Update
        # policy_nn.fit(ob_no, ac_na, adv_n)
        policy_nn.fit(ob_no, ac_na, adv_n, mpc_ac_na)
        # sess.run(update_op, feed_dict={sy_ob_no: ob_no, sy_ac_na: ac_na, sy_adv_n: adv_n})

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
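
# The nested while loops in the Q-value computation above evaluate, for each
# timestep t, q_t = sum_{t' >= t} gamma^(t' - t) * r_{t'} when reward_to_go is
# True, and the full discounted trajectory return sum_{t'} gamma^t' * r_{t'}
# repeated at every timestep otherwise. A minimal sketch of the same targets
# computed in O(T) per path via a backward recursion -- an illustrative
# alternative with hypothetical name `discounted_q_values`, not the code used
# in the training loop above:
def discounted_q_values(rewards, gamma, reward_to_go):
    """Return per-timestep Q-value targets for one path (sketch)."""
    rewards = np.asarray(rewards, dtype=np.float64)
    if reward_to_go:
        q = np.zeros_like(rewards)
        running = 0.0
        # q_t = r_t + gamma * q_{t+1}, accumulated from the last step backward
        for t in reversed(range(len(rewards))):
            running = rewards[t] + gamma * running
            q[t] = running
        return q
    # Same discounted full-trajectory return assigned to every timestep
    total = np.sum(rewards * gamma ** np.arange(len(rewards)))
    return np.full(len(rewards), total)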