def train(state_cb, pub_cmd, pub_act, rate, cost_fn,
          logdir=None,
          render=False,
          learning_rate=1e-3,
          onpol_iters=10,
          dynamics_iters=60,
          batch_size=512,
          num_paths_random=10,
          num_paths_onpol=10,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=15,
          n_layers=2,
          size=500,
          activation=tf.nn.relu,
          output_activation=None):
    """
    Arguments:

    onpol_iters                 Number of iterations of onpolicy aggregation
                                for the loop to run.

    dynamics_iters              Number of iterations of training for the
                                dynamics model which happen per iteration of
                                the aggregation loop.

    batch_size                  Batch size for dynamics training.

    num_paths_random            Number of paths/trajectories/rollouts generated
                                by a random agent. We use these to train our
                                initial dynamics model.

    num_paths_onpol             Number of paths to collect at each iteration of
                                aggregation, using the Model Predictive Control
                                policy.

    num_simulated_paths        How many fictitious rollouts the MPC policy
                                should generate each time it is asked for an
                                action.

    env_horizon                 Number of timesteps in each path.

    mpc_horizon                 The MPC policy generates actions by imagining
                                fictitious rollouts, and picking the first
                                action of the best fictitious rollout. This
                                argument is how many timesteps should be in
                                each fictitious rollout.

    n_layers/size/activations   Neural network architecture arguments.
    """
    logz.configure_output_dir(logdir)

    #========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.
    rand_controller = RandomController()
    paths = sample(state_cb, pub_cmd, pub_act, rate, rand_controller,
                   num_paths_random, env_horizon, render)
    data = paths_to_array(paths)

    #========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.
    #
    normalization = compute_normalization(data)

    #========================================================
    #
    # Build dynamics model and MPC controllers.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    #========================================================
    #
    # Take multiple iterations of onpolicy aggregation: at each iteration,
    # refit the dynamics model to the current dataset, then take onpolicy
    # samples and aggregate them into the dataset.
    #
    # Note: in this assignment you don't need to use a mixing ratio between
    # new and old data, as described in https://arxiv.org/abs/1708.02596.
    #
    for itr in range(onpol_iters):
        # Fit dynamics model
        print('Training dynamics model...')
        dyn_model.fit(data)
        plot_comparison(dyn_model, state_cb, pub_act, pub_cmd, rate)
        mpc_controller.dyn_model = dyn_model

        costs = []
        returns = []

        # Do MPC
        for i in range(num_paths_onpol):
            print('On policy path: %i' % i)
            obs_t, obs_tp1, acs_t, rews_t = [], [], [], []
            s_t = state_cb.reset(pub_act, pub_cmd)
            total_return = 0

            for j in range(env_horizon):
                # print('Timestep: %i, Return: %g' % (j, total_return))
                a_t = mpc_controller.get_action(s_t)
                s_tp1, _ = state_cb.step(a_t, pub_act, pub_cmd)

                # Per-step reward: change in every 12th state entry,
                # summed over the 9 tracked blocks.
                r_t = 0
                for k in range(9):
                    r_t += s_tp1[k * 12] - s_t[k * 12]
                total_return += r_t

                if render:
                    env.render()
                    time.sleep(0.05)

                obs_t.append(s_t)
                obs_tp1.append(s_tp1)
                acs_t.append(a_t)
                rews_t.append(r_t)
                s_t = s_tp1

            path = {"observations": np.array(obs_t),
                    "next_observations": np.array(obs_tp1),
                    "actions": np.array(acs_t),
                    "rewards": np.array(rews_t)}
            total_cost = path_cost(cost_fn, path)

            paths.append(path)
            returns.append(total_return)
            costs.append(total_cost)
            print('Total cost: %g, Total reward: %g' % (total_cost, total_return))

        data = paths_to_array(paths)
        normalization = compute_normalization(data)
        # Set new normalization statistics for dynamics model
        dyn_model.normalization = normalization

        # LOGGING
        # Statistics for performance of MPC policy using
        # our learned dynamics model
        logz.log_tabular('Iteration', itr)
        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled out trajectory
        # using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))
        logz.dump_tabular()
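# --------------------------------------------------------------------------
# Illustrative sketch (hypothetical helper, not called by train() above): the
# comments in train() say compute_normalization returns mean/std statistics
# for observations, actions, and deltas (o_{t+1} - o_t), which are later used
# to normalize network inputs and denormalize its outputs. A minimal version
# could look like the function below, assuming `data` is a dict of stacked
# numpy arrays as produced by a paths_to_array-style helper; the real
# compute_normalization in this repo may use a different layout or return type.
def _example_compute_normalization(data, eps=1e-8):
    obs = data['observations']
    acs = data['actions']
    deltas = data['next_observations'] - data['observations']
    # eps keeps later divisions safe when a state dimension is constant.
    return {'mean_obs': obs.mean(axis=0), 'std_obs': obs.std(axis=0) + eps,
            'mean_acs': acs.mean(axis=0), 'std_acs': acs.std(axis=0) + eps,
            'mean_deltas': deltas.mean(axis=0), 'std_deltas': deltas.std(axis=0) + eps}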
def train_PG(exp_name='',
             env_name='',
             n_iter=100,
             gamma=1.0,
             min_timesteps_per_batch=1000,
             max_path_length=None,
             learning_rate=5e-3,
             reward_to_go=False,
             animate=True,
             logdir=None,
             normalize_advantages=False,
             nn_baseline=False,
             seed=0,
             # network arguments
             n_layers=1,
             size=32,
             # mb mpc arguments
             model_learning_rate=1e-3,
             onpol_iters=10,
             dynamics_iters=260,
             batch_size=512,
             num_paths_random=10,
             num_paths_onpol=10,
             num_simulated_paths=1000,
             env_horizon=1000,
             mpc_horizon=10,
             m_n_layers=2,
             m_size=500,
             ):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    # env = gym.make(env_name)
    env = HalfCheetahEnvNew()
    cost_fn = cheetah_cost_fn
    activation = tf.nn.relu
    output_activation = None

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    # max_path_length = max_path_length or env.spec.max_episode_steps
    max_path_length = max_path_length

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    # Print environment information
    print("-------- env info --------")
    print("Environment name: ", env_name)
    print("Action space is discrete: ", discrete)
    print("Action space dim: ", ac_dim)
    print("Observation space dim: ", ob_dim)
    print("Max_path_length ", max_path_length)

    #========================================================================================#
    # Random data collection
    #========================================================================================#

    random_controller = RandomController(env)
    data_buffer_model = DataBuffer()
    data_buffer_ppo = DataBuffer_general(10000, 4)

    # sample paths with the random controller
    print("collecting random data .....")
    paths = sample(env,
                   random_controller,
                   num_paths=num_paths_random,
                   horizon=env_horizon,
                   render=False,
                   verbose=False)

    # add into buffer
    for path in paths:
        for n in range(len(path['observations'])):
            data_buffer_model.add(path['observations'][n],
                                  path['actions'][n],
                                  path['next_observations'][n])

    print("data buffer size: ", data_buffer_model.size)

    normalization = compute_normalization(data_buffer_model)

    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#

    tf_config = tf.ConfigProto()
    tf_config.allow_soft_placement = True
    tf_config.intra_op_parallelism_threads = 4
    tf_config.inter_op_parallelism_threads = 1
    sess = tf.Session(config=tf_config)

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    policy_nn = policy_network_ppo(sess, ob_dim, ac_dim, discrete, n_layers, size, learning_rate)

    if nn_baseline:
        value_nn = value_network(sess, ob_dim, n_layers, size, learning_rate)

    sess.__enter__()  # equivalent to `with sess:`
    tf.global_variables_initializer().run()

    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)

        if MPC:
            dyn_model.fit(data_buffer_model)
        returns = []
        costs = []

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            # print("data buffer size: ", data_buffer_model.size)
            current_path = {'observations': [],
                            'actions': [],
                            'reward': [],
                            'next_observations': []}

            ob = env.reset()
            obs, acs, mpc_acs, rewards = [], [], [], []
            animate_this_episode = (len(paths) == 0 and (itr % 10 == 0) and animate)
            steps = 0
            return_ = 0

            while True:
                # print("steps ", steps)
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)

                obs.append(ob)

                if MPC:
                    mpc_ac = mpc_controller.get_action(ob)
                else:
                    mpc_ac = random_controller.get_action(ob)

                ac = policy_nn.predict(ob, mpc_ac)
                ac = ac[0]

                if not PG:
                    ac = mpc_ac

                acs.append(ac)
                mpc_acs.append(mpc_ac)

                current_path['observations'].append(ob)

                ob, rew, done, _ = env.step(ac)

                current_path['reward'].append(rew)
                current_path['actions'].append(ac)
                current_path['next_observations'].append(ob)

                return_ += rew
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break

            if MPC:
                # cost & return
                cost = path_cost(cost_fn, current_path)
                costs.append(cost)
                returns.append(return_)
                print("total return: ", return_)
                print("costs: ", cost)

                # add into buffers
                for n in range(len(current_path['observations'])):
                    data_buffer_model.add(current_path['observations'][n],
                                          current_path['actions'][n],
                                          current_path['next_observations'][n])

            for n in range(len(current_path['observations'])):
                data_buffer_ppo.add(current_path['observations'][n],
                                    current_path['actions'][n],
                                    current_path['reward'][n],
                                    current_path['next_observations'][n])

            path = {"observation": np.array(obs),
                    "reward": np.array(rewards),
                    "action": np.array(acs),
                    "mpc_action": np.array(mpc_acs)}

            paths.append(path)
            timesteps_this_batch += pathlength(path)
            # print("timesteps_this_batch", timesteps_this_batch)
            if timesteps_this_batch > min_timesteps_per_batch:
                break

        total_timesteps += timesteps_this_batch

        print("data_buffer_ppo.size:", data_buffer_ppo.size)

        # Build arrays for observation and action for the policy gradient
        # update by concatenating across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        mpc_ac_na = np.concatenate([path["mpc_action"] for path in paths])

        # Computing Q-values
        if reward_to_go:
            q_n = []
            for path in paths:
                for t in range(len(path["reward"])):
                    t_ = 0
                    q = 0
                    while t_ < len(path["reward"]):
                        if t_ >= t:
                            q += gamma**(t_ - t) * path["reward"][t_]
                        t_ += 1
                    q_n.append(q)
            q_n = np.asarray(q_n)
        else:
            q_n = []
            for path in paths:
                for t in range(len(path["reward"])):
                    t_ = 0
                    q = 0
                    while t_ < len(path["reward"]):
                        q += gamma**t_ * path["reward"][t_]
                        t_ += 1
                    q_n.append(q)
            q_n = np.asarray(q_n)

        # Computing Baselines
        if nn_baseline:
            # b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no: ob_no})
            b_n = value_nn.predict(ob_no)
            b_n = normalize(b_n)
            b_n = denormalize(b_n, np.std(q_n), np.mean(q_n))
            adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()

        # Advantage Normalization
        if normalize_advantages:
            adv_n = normalize(adv_n)

        # Optimizing Neural Network Baseline
        if nn_baseline:
            b_n_target = normalize(q_n)
            value_nn.fit(ob_no, b_n_target)
            # sess.run(baseline_update_op, feed_dict={sy_ob_no: ob_no, sy_baseline_target_n: b_n_target})

        # Performing the Policy Update
        # policy_nn.fit(ob_no, ac_na, adv_n)
        policy_nn.fit(ob_no, ac_na, adv_n, mpc_ac_na)
        # sess.run(update_op, feed_dict={sy_ob_no: ob_no, sy_ac_na: ac_na, sy_adv_n: adv_n})

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
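# --------------------------------------------------------------------------
# Illustrative sketch (hypothetical helper, not called by train_PG above):
# the Q-value loops in train_PG compute, for each timestep t of a path,
# either the full discounted return or the "reward-to-go"
# sum_{t' >= t} gamma**(t'-t) * r_{t'}. The same reward-to-go quantity can be
# computed in a single backward pass (assumes numpy is imported as np, as in
# the rest of this file):
def _example_reward_to_go(rewards, gamma):
    """Return an array q where q[t] = sum_{t'>=t} gamma**(t'-t) * rewards[t']."""
    q = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        q[t] = running
    return q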
def train(env,
          cost_fn,
          logdir=None,
          render=False,
          learning_rate=1e-3,
          onpol_iters=10,
          dynamics_iters=60,
          batch_size=512,
          num_paths_random=10,
          num_paths_onpol=10,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=15,
          n_layers=2,
          size=500,
          activation=tf.nn.relu,
          output_activation=None
          ):
    """
    Arguments:

    onpol_iters                 Number of iterations of onpolicy aggregation
                                for the loop to run.

    dynamics_iters              Number of iterations of training for the
                                dynamics model which happen per iteration of
                                the aggregation loop.

    batch_size                  Batch size for dynamics training.

    num_paths_random            Number of paths/trajectories/rollouts generated
                                by a random agent. We use these to train our
                                initial dynamics model.

    num_paths_onpol             Number of paths to collect at each iteration of
                                aggregation, using the Model Predictive Control
                                policy.

    num_simulated_paths        How many fictitious rollouts the MPC policy
                                should generate each time it is asked for an
                                action.

    env_horizon                 Number of timesteps in each path.

    mpc_horizon                 The MPC policy generates actions by imagining
                                fictitious rollouts, and picking the first
                                action of the best fictitious rollout. This
                                argument is how many timesteps should be in
                                each fictitious rollout.

    n_layers/size/activations   Neural network architecture arguments.
    """
    logz.configure_output_dir(logdir)

    #========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.
    random_controller = RandomController(env)

    """ YOUR CODE HERE """
    # sample random paths
    data = sample(env, random_controller, num_paths_random, env_horizon)

    #========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.
    #
    normalization = compute_normalization(data)

    #========================================================
    #
    # Build dynamics model and MPC controllers.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    #========================================================
    #
    # Take multiple iterations of onpolicy aggregation: at each iteration,
    # refit the dynamics model to the current dataset, then take onpolicy
    # samples and aggregate them into the dataset.
    #
    # Note: in this assignment you don't need to use a mixing ratio between
    # new and old data, as described in https://arxiv.org/abs/1708.02596.
    #
    for itr in range(onpol_iters):
        """ YOUR CODE HERE """
        dyn_model.fit(data)

        paths = []
        # iterate over paths
        for n_path in range(num_paths_onpol):
            ob = env.reset()
            obs, n_obs, acs, rewards = [], [], [], []
            for n_horz in range(env_horizon):
                obs.append(ob)
                ac = mpc_controller.get_action(ob)
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                n_obs.append(ob)
                rewards.append(rew)

            cost = trajectory_cost_fn(cost_fn, np.array(obs), np.array(acs), np.array(n_obs))
            path = {"observations": np.array(obs),
                    "next_observations": np.array(n_obs),
                    "rewards": np.array(rewards),
                    "actions": np.array(acs),
                    "costs": cost}
            paths.append(path)
            data.append(path)

        returns = [path["rewards"].sum() for path in paths]
        costs = [path["costs"] for path in paths]

        # LOGGING
        # Statistics for performance of MPC policy using
        # our learned dynamics model
        logz.log_tabular('Iteration', itr)
        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled out trajectory
        # using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))
        logz.dump_tabular()
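# --------------------------------------------------------------------------
# Illustrative sketch (hypothetical; the real trajectory_cost_fn used above
# lives elsewhere in this repo): the cost of a trajectory under a per-step
# cost function is typically just the sum of cost_fn over
# (state, action, next_state) triples, which is what the call above assumes.
def _example_trajectory_cost_fn(cost_fn, states, actions, next_states):
    return sum(cost_fn(s, a, s_next)
               for s, a, s_next in zip(states, actions, next_states))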
def train(env,
          cost_fn,
          logdir=None,
          render=False,
          learning_rate=1e-3,
          onpol_iters=10,
          dynamics_iters=60,
          batch_size=512,
          num_paths_random=10,
          num_paths_onpol=10,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=15,
          n_layers=2,
          size=500,
          activation=tf.nn.relu,
          output_activation=None
          ):
    # tracker = SummaryTracker()
    """
    Arguments:

    onpol_iters                 Number of iterations of onpolicy aggregation
                                for the loop to run.

    dynamics_iters              Number of iterations of training for the
                                dynamics model which happen per iteration of
                                the aggregation loop.

    batch_size                  Batch size for dynamics training.

    num_paths_random            Number of paths/trajectories/rollouts generated
                                by a random agent. We use these to train our
                                initial dynamics model.

    num_paths_onpol             Number of paths to collect at each iteration of
                                aggregation, using the Model Predictive Control
                                policy.

    num_simulated_paths        How many fictitious rollouts the MPC policy
                                should generate each time it is asked for an
                                action.

    env_horizon                 Number of timesteps in each path.

    mpc_horizon                 The MPC policy generates actions by imagining
                                fictitious rollouts, and picking the first
                                action of the best fictitious rollout. This
                                argument is how many timesteps should be in
                                each fictitious rollout.

    n_layers/size/activations   Neural network architecture arguments.
    """
    logz.configure_output_dir(logdir)

    #========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.

    """ YOUR CODE HERE """

    # Print env info
    print("-------- env info --------")
    print("observation_space: ", env.observation_space.shape)
    print("action_space: ", env.action_space.shape)
    print(" ")

    random_controller = RandomController(env)
    data_buffer = DataBuffer()
    bc_data_buffer = DataBuffer_SA(BC_BUFFER_SIZE)

    # sample paths with the random controller
    print("collecting random data .....")
    paths = sample(env,
                   random_controller,
                   num_paths=num_paths_random,
                   horizon=env_horizon,
                   render=False,
                   verbose=False)

    # add into buffer
    for path in paths:
        for n in range(len(path['observations'])):
            data_buffer.add(path['observations'][n],
                            path['actions'][n],
                            path['next_observations'][n])

    #========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.
    #
    print("data buffer size: ", data_buffer.size)

    normalization = compute_normalization(data_buffer)

    #========================================================
    #
    # Build dynamics model, MPC controllers, and behavioral cloning network.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    bc_net = BCnetwork(sess, env, BATCH_SIZE_BC, learning_rate)

    mpc_controller_bc = MPCcontroller_BC(env=env,
                                         dyn_model=dyn_model,
                                         bc_network=bc_net,
                                         horizon=mpc_horizon,
                                         cost_fn=cost_fn,
                                         num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    # init or load checkpoint with saver
    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR)

    if checkpoint and checkpoint.model_checkpoint_path and LOAD_MODEL:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old checkpoint")
        if not os.path.exists(CHECKPOINT_DIR):
            os.mkdir(CHECKPOINT_DIR)

    #========================================================
    #
    # Take multiple iterations of onpolicy aggregation: at each iteration,
    # refit the dynamics model to the current dataset, then take onpolicy
    # samples and aggregate them into the dataset.
    #
    # Note: in this assignment you don't need to use a mixing ratio between
    # new and old data, as described in https://arxiv.org/abs/1708.02596.
    #
    for itr in range(onpol_iters):
        """ YOUR CODE HERE """
        print("onpol_iters: ", itr)

        dyn_model.fit(data_buffer)
        saver.save(sess, CHECKPOINT_DIR)

        returns = []
        costs = []

        for w in range(num_paths_onpol):
            print("paths_onpol: ", w, " running.....")
            print("data buffer size: ", data_buffer.size)

            st = env.reset_model()
            path = {'observations': [], 'actions': [], 'next_observations': []}
            # tracker.print_diff()
            return_ = 0

            for i in range(env_horizon):
                if render:
                    env.render()
                # print("env_horizon: ", i)

                if BEHAVIORAL_CLONING:
                    if bc_data_buffer.size > 2000:
                        at = mpc_controller_bc.get_action(st)
                    else:
                        at = mpc_controller.get_action(st)
                else:
                    at = mpc_controller.get_action(st)
                    # at = random_controller.get_action(st)

                st_next, env_reward, _, _ = env._step(at)
                path['observations'].append(st)
                path['actions'].append(at)
                path['next_observations'].append(st_next)
                st = st_next
                return_ += env_reward

            # cost & return
            cost = path_cost(cost_fn, path)
            costs.append(cost)
            returns.append(return_)
            print("total return: ", return_)
            print("costs: ", cost)

            # add into buffers
            for n in range(len(path['observations'])):
                data_buffer.add(path['observations'][n],
                                path['actions'][n],
                                path['next_observations'][n])
                bc_data_buffer.add(path['observations'][n],
                                   path['actions'][n])

        if BEHAVIORAL_CLONING:
            behavioral_cloning(sess, env, bc_net, mpc_controller,
                               env_horizon, bc_data_buffer, Training_epoch=1000)

        # LOGGING
        # Statistics for performance of MPC policy using
        # our learned dynamics model
        logz.log_tabular('Iteration', itr)
        # logz.log_tabular('Average_BC_Return', np.mean(bc_returns))
        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled out trajectory
        # using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))
        logz.dump_tabular()
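# --------------------------------------------------------------------------
# Illustrative sketch (hypothetical; the real MPCcontroller / MPCcontroller_BC
# classes are defined elsewhere in this repo): the docstrings above describe
# random-shooting MPC, i.e. sample num_simulated_paths random action sequences
# of length `horizon`, roll them forward through the learned dynamics model,
# score them with cost_fn, and return the first action of the cheapest
# imagined rollout. The batched dyn_model.predict(states, actions) call is an
# assumption about the model's interface.
def _example_mpc_get_action(state, dyn_model, cost_fn, action_space,
                            horizon=15, num_simulated_paths=1000):
    states = np.repeat(state[None, :], num_simulated_paths, axis=0)
    total_cost = np.zeros(num_simulated_paths)
    first_actions = None
    for t in range(horizon):
        # One random action per simulated path at this timestep.
        actions = np.stack([action_space.sample() for _ in range(num_simulated_paths)])
        if t == 0:
            first_actions = actions
        next_states = dyn_model.predict(states, actions)  # assumed batched predict
        total_cost += np.array([cost_fn(s, a, sn)
                                for s, a, sn in zip(states, actions, next_states)])
        states = next_states
    return first_actions[np.argmin(total_cost)]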
def train(env,
          cost_fn,
          logdir=None,
          render=False,
          learning_rate=1e-3,
          onpol_iters=10,
          dynamics_iters=60,
          batch_size=512,
          num_paths_random=10,
          num_paths_onpol=10,
          num_simulated_paths=1000,
          env_horizon=1000,
          mpc_horizon=15,
          n_layers=2,
          size=500,
          activation=tf.nn.relu,
          output_activation=None,
          ):
    """
    Arguments:

    onpol_iters                 Number of iterations of onpolicy aggregation
                                for the loop to run.

    dynamics_iters              Number of iterations of training for the
                                dynamics model which happen per iteration of
                                the aggregation loop.

    batch_size                  Batch size for dynamics training.

    num_paths_random            Number of paths/trajectories/rollouts generated
                                by a random agent. We use these to train our
                                initial dynamics model.

    num_paths_onpol             Number of paths to collect at each iteration of
                                aggregation, using the Model Predictive Control
                                policy.

    num_simulated_paths        How many fictitious rollouts the MPC policy
                                should generate each time it is asked for an
                                action.

    env_horizon                 Number of timesteps in each path.

    mpc_horizon                 The MPC policy generates actions by imagining
                                fictitious rollouts, and picking the first
                                action of the best fictitious rollout. This
                                argument is how many timesteps should be in
                                each fictitious rollout.

    n_layers/size/activations   Neural network architecture arguments.
    """
    logz.configure_output_dir(logdir)

    #========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.
    random_controller = RandomController(env)

    """ YOUR CODE HERE """
    paths_rand = sample(env,
                        random_controller,
                        num_paths=num_paths_random,
                        horizon=env_horizon,
                        render=render,
                        verbose=False)

    #========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.
    #
    """ YOUR CODE HERE """
    normalization = compute_normalization(paths_rand)
    gamma = 0.99

    #========================================================
    #
    # Build dynamics model and MPC controllers.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    #========================================================
    #
    # Take multiple iterations of onpolicy aggregation: at each iteration,
    # refit the dynamics model to the current dataset, then take onpolicy
    # samples and aggregate them into the dataset.
    #
    # Note: in this assignment you don't need to use a mixing ratio between
    # new and old data, as described in https://arxiv.org/abs/1708.02596.
    #
    # Pre-fit the dynamics model on random data before onpolicy DAgger:
    print("****** Pretrain dynamics model *******")
    losses = []
    obs_rand = np.concatenate([path["observation"] for path in paths_rand])
    action_rand = np.concatenate([path["action"] for path in paths_rand])
    next_ob_rand = np.concatenate([path["obs_next"] for path in paths_rand])
    data_size_rand = obs_rand.shape[0]

    for i in range(1000):
        # sample a batch from the random-policy data
        batch_idx_rand = np.random.randint(data_size_rand, size=batch_size)
        batch_ob_rand = obs_rand[batch_idx_rand, :]
        batch_ac_rand = action_rand[batch_idx_rand, :]
        batch_nxt_rand = next_ob_rand[batch_idx_rand, :]

        batch_ob = np.copy(batch_ob_rand)
        batch_ac = np.copy(batch_ac_rand)
        batch_nxt = np.copy(batch_nxt_rand)

        loss = dyn_model.fit(batch_ob, batch_ac, batch_nxt)
        losses.append(loss)
        if i % 20 == 0:
            print('loss', loss)

    costs = []
    returns = []
    paths_rl = []

    for itr in range(onpol_iters):
        """ YOUR CODE HERE """
        # fit dynamics model
        if itr > 0:
            obs_rl = np.concatenate([path["observation"] for path in paths_rl])
            action_rl = np.concatenate([path["action"] for path in paths_rl])
            next_ob_rl = np.concatenate([path["obs_next"] for path in paths_rl])

        obs_rand = np.concatenate([path["observation"] for path in paths_rand])
        action_rand = np.concatenate([path["action"] for path in paths_rand])
        next_ob_rand = np.concatenate([path["obs_next"] for path in paths_rand])
        # print(obs[128, :].shape)

        data_size_rand = obs_rand.shape[0]
        if itr > 0:
            data_size_rl = obs_rl.shape[0]
        # batch_size=128
        losses = []

        # fit the model on a mixture of random and onpolicy data
        for i in range(dynamics_iters):
            # sample a batch from the random-policy data
            batch_idx_rand = np.random.randint(data_size_rand, size=batch_size // 20)
            batch_ob_rand = obs_rand[batch_idx_rand, :]
            batch_ac_rand = action_rand[batch_idx_rand, :]
            batch_nxt_rand = next_ob_rand[batch_idx_rand, :]

            # sample a batch from the onpolicy data
            if itr > 0:
                batch_idx_rl = np.random.randint(data_size_rl, size=batch_size * 19 // 20)
                batch_ob_rl = obs_rl[batch_idx_rl, :]
                batch_ac_rl = action_rl[batch_idx_rl, :]
                batch_nxt_rl = next_ob_rl[batch_idx_rl, :]

                # mix them
                batch_ob = np.concatenate((batch_ob_rand, batch_ob_rl))
                batch_ac = np.concatenate((batch_ac_rand, batch_ac_rl))
                batch_nxt = np.concatenate((batch_nxt_rand, batch_nxt_rl))
            else:
                batch_ob = np.copy(batch_ob_rand)
                batch_ac = np.copy(batch_ac_rand)
                batch_nxt = np.copy(batch_nxt_rand)

            loss = dyn_model.fit(batch_ob, batch_ac, batch_nxt)
            losses.append(loss)
            # if i % 20 == 0:
            #     print('loss', loss)

        print("on policy dagger ", itr)

        ob = env.reset()
        observes, acs, rewards, obs_2, path_returns = [], [], [], [], []
        steps = 0
        g = 0
        max_path_length = mpc_controller.horizon
        timesteps_this_batch = 0

        while True:
            while True:
                observes.append(ob)
                ac = mpc_controller.get_action(ob)
                # print(ac)
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                g += rew * gamma**steps
                obs_2.append(ob)
                rewards.append(rew)
                path_returns.append(g)
                steps += 1
                if done or steps > max_path_length:
                    terminated = done
                    break

            path = {"observation": np.array(observes),
                    "reward": np.array(rewards),
                    "action": np.array(acs),
                    "obs_next": np.array(obs_2),
                    "return": np.array(path_returns)}
            paths_rl.append(path)
            timesteps_this_batch += pathlength(path)
            print(g)
            if timesteps_this_batch > batch_size:
                break

        trajectory_cost = trajectory_cost_fn(cheetah_cost_fn,
                                             path["observation"],
                                             path["action"],
                                             path["obs_next"])
        costs.append(trajectory_cost)
        returns.append(path["return"][-1])
        # print(batch_ob.shape)

        # LOGGING
        # Statistics for performance of MPC policy using
        # our learned dynamics model
        logz.log_tabular('Iteration', itr)
        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled out trajectory
        # using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))
        logz.dump_tabular()
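# --------------------------------------------------------------------------
# Illustrative sketch (hypothetical; the real NNDynamicsModel is defined
# elsewhere in this repo): the comments above repeatedly describe normalizing
# the network's inputs and denormalizing its outputs. For a delta-predicting
# dynamics model, the prediction step typically looks like the function below,
# assuming `normalization` is a dict of mean/std arrays (as in the sketch
# earlier in this file) and `net` maps normalized (obs, action) inputs to a
# normalized state delta.
def _example_dyn_predict(net, normalization, obs, acs):
    obs_norm = (obs - normalization['mean_obs']) / normalization['std_obs']
    acs_norm = (acs - normalization['mean_acs']) / normalization['std_acs']
    delta_norm = net(np.concatenate([obs_norm, acs_norm], axis=-1))
    delta = delta_norm * normalization['std_deltas'] + normalization['mean_deltas']
    return obs + delta  # predicted next observation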
def main():
    nb_total_steps = 1000
    nb_iterations = 40
    hidden_layers = [256, 256]

    writer = tensorboardX.SummaryWriter()

    args = parse_args(__doc__, ['env'])
    env = gym.make(args.env)
    ctrl = rand_ctrl = RandomController(env)
    # ipdb.set_trace()

    print('#inputs : %d' % ctrl.nb_inputs())
    print('#actions: %d' % ctrl.nb_actions())

    # f_net = make_net(
    #     [ctrl.nb_inputs() + ctrl.nb_actions()] + hidden_layers + [ctrl.nb_inputs()],
    #     [nn.ReLU() for _ in hidden_layers],
    # )

    f_net = MOENetwork(
        nb_inputs=ctrl.nb_inputs() + ctrl.nb_actions(),
        nb_experts=4,
        gait_layers=[64],
        expert_layers=[64, ctrl.nb_inputs()],
    )

    data = collect_data(env, ctrl, nb_total_steps * 10)
    # ipdb.set_trace()
    dynamics = DynamicsModel(env, f_net, data.get_all(), writer=writer)

    # cost_func = lambda s, a, sn: -sn[3].item()  # refers to vx
    cost_func = get_cost(args.env)  # refers to vx

    # data.calc_normalizations()
    # dynamics.fit(data)
    mpc_ctrl = MPCcontroller(env, dynamics.predict, cost_func,
                             num_simulated_paths=100, horizon=10, num_mpc_steps=10)

    eval_args = EvaluationArgs(nb_burnin_steps=4,
                               nb_episodes=10,
                               horizons=[1, 2, 4, 8, 16, 32])

    for i in range(nb_iterations):
        print('Iteration', i)
        new_data = collect_data(env, ctrl, nb_total_steps)
        dynamics.fit(*new_data.get_all())
        data.extend(new_data)
        dynamics.fit(*data.sample(sample_size=4 * nb_total_steps))

        evaluate_and_log_dynamics(
            dynamics.predict, env, rand_ctrl,
            writer=writer, i_step=i, args=eval_args
        )
        evaluate_and_log_dynamics(
            dynamics.predict, env, mpc_ctrl,
            writer=writer, i_step=i, args=eval_args
        )
        # dynamics.fit(*data.get_all())

        # Alternate between the random and the MPC controller for data collection.
        if random.random() > 0.5:
            ctrl = rand_ctrl
        else:
            ctrl = mpc_ctrl

    env = gym.make(args.env)
    ctrl = MPCcontroller(env, dynamics.predict, cost_func,
                         num_simulated_paths=1000, num_mpc_steps=4)  # TODO

    env.render(mode='human')
    obs = env.reset()
    for _ in range(100):
        # time.sleep(1. / 60.)
        obs, r, done, _ = env.step(ctrl.get_action(obs))
        # print(' ', cost_func(obs))
        if done:
            print("done:", r, obs)
            time.sleep(1)
            ctrl.reset()
            obs = env.reset()

    ipdb.set_trace()
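# --------------------------------------------------------------------------
# Illustrative sketch (hypothetical; get_cost() is the real lookup used in
# main()): the "refers to vx" comments and the commented-out lambda above
# suggest a cost that rewards forward velocity. Assuming index 3 of the next
# state holds vx, as in that lambda, a per-step cost in the same spirit is:
def _example_vx_cost(state, action, next_state):
    return -float(next_state[3])  # lower cost = faster forward motion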