import os
import time

import gym
import numpy as np
import torch

# Project-local helpers (eval_mode, plot_rollout, rollout, train_model, the
# metric functions, log, etc.) are assumed to be imported elsewhere in the repo.


def evaluate_policy(env, policy, step, L, num_episodes, num_eval_timesteps,
                    video_dir=None, metric=None, show=False):
    """Roll out `policy` in `env` for `num_episodes` episodes and return per-episode rewards."""
    returns = []
    start = time.time()
    for i in range(num_episodes):
        # video = VideoRecorder(env, enabled=video_dir is not None and i == 0)
        s = 0
        states = []
        actions = []
        obs = env.reset()
        done = False
        total_reward = 0
        while (not done) and (s < num_eval_timesteps):
            with torch.no_grad():
                with eval_mode(policy):
                    action = policy.select_action(obs)
            # Rescale the policy output from [-1, 1] to [0, action_space.high].
            action_scale = env.action_space.high * (action + 1) / 2
            obs, reward, done, _ = env.step(action_scale)
            states.append(obs)
            actions.append(action_scale)
            if metric is not None:
                reward = metric(obs, action)
            # video.record()
            total_reward += reward
            s += 1
        returns.append(total_reward)
        if show:
            plot_rollout(states, actions, pry=[1, 0, 2])
        else:
            plot_rollout(states, actions, pry=[1, 0, 2], save=True, loc=f"/{step}")
    end = time.time()
    print(f"Rollout in {end - start} s, logged {len(states)} states")
    L.info(f" - - Evaluated, mean reward {np.mean(returns)}, n={num_episodes}")
    return returns
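# A minimal, self-contained check of the action rescaling used above. The
# policy emits actions in [-1, 1], and `high * (action + 1) / 2` maps them to
# [0, high]; the `high` vector below is a hypothetical stand-in for
# env.action_space.high, not a value from the project.
def _check_action_rescale():
    high = np.array([2000.0, 2000.0, 2000.0, 2000.0])
    for a in (-1.0, 0.0, 1.0):
        print(a, "->", high * (a + 1) / 2)  # -1 -> 0, 0 -> high / 2, 1 -> high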
def basic_rollout(self, s0, i_model, plot=False):
    """Roll the learned model `i_model` forward from state `s0` under the current policy."""
    state_log = []
    action_log = []
    max_len = self.b_cfg.max_length

    cur_action, update = self.policy.get_action(s0)
    state_log.append(s0)
    action_log.append(cur_action)
    next_state, logvars = smart_model_step(i_model, s0, cur_action)
    state = push_history(next_state, s0)

    cost = 0
    for k in range(max_len):
        cur_action, update = self.policy.get_action(next_state)
        state_log.append(state)
        action_log.append(cur_action)
        next_state, logvars = smart_model_step(i_model, state, cur_action)
        state = push_history(next_state, state)
        # The terminal step carries the terminal-cost weight t_c; every other
        # step shares the running cost l_c evenly.
        if k == (max_len - 1):
            weight = self.t_c
        else:
            weight = self.l_c / max_len
        cost += weight * get_reward_euler(next_state, cur_action, pry=self.cfg.pid.params.pry)

    if plot:
        plot_rollout(state_log, np.stack(action_log).squeeze(), pry=self.cfg.pid.params.pry)
    return cost / self.norm_cost, [state_log, action_log]
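# Sketch of the cost schedule in basic_rollout: intermediate steps each carry
# weight l_c / max_len and the final step carries t_c, so for a constant
# per-step reward the total weight is l_c * (max_len - 1) / max_len + t_c,
# roughly l_c + t_c. The values below are illustrative, not from any config.
def _check_cost_weights(l_c=1.0, t_c=10.0, max_len=50):
    weights = [t_c if k == max_len - 1 else l_c / max_len for k in range(max_len)]
    print(sum(weights))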
def mpc(cfg):
    log.info("============= Configuration =============")
    log.info(f"Config:\n{cfg.pretty()}")
    log.info("=========================================")

    env_name = cfg.env.params.name
    env = gym.make(env_name)
    env.reset()

    if cfg.metric.name == 'Living':
        metric = living_reward
    elif cfg.metric.name == 'Rotation':
        metric = rotation_mat
    elif cfg.metric.name == 'Square':
        metric = squ_cost
    elif cfg.metric.name == 'Yaw':
        metric = yaw_r
    else:
        raise ValueError("Improper metric name passed")

    for s in range(cfg.experiment.seeds):
        log.info(f"Random Seed: {s}")
        total_costs = []
        data_rand = []
        total_steps = []

        # Collect initial data with a random controller.
        r = 0
        while r < cfg.experiment.random:
            data_r = rollout(env, RandomController(env, cfg), cfg.experiment, metric=metric)
            plot_rollout(data_r[0], data_r[1], pry=cfg.pid.params.pry, save=cfg.save, loc=f"/R_{r}")
            rews = data_r[-2]
            sim_error = data_r[-1]
            if sim_error:
                print("Repeating strange simulation")
                continue
            total_costs.append(np.sum(rews))  # summed for minimization
            r += 1
            data_rand.append(data_r)
            total_steps.append(0)

        X, dX, U = to_XUdX(data_r)
        X, dX, U = combine_data(data_rand[:-1], (X, dX, U))

        msg = "Random rollouts completed: "
        msg += f"mean cumulative reward {np.mean(total_costs)}, "
        msg += f"mean flight length {cfg.policy.params.period * np.mean([np.shape(d[0])[0] for d in data_rand])}"
        log.info(msg)

        trial_log = dict(
            env_name=cfg.env.params.name,
            model=None,
            seed=cfg.random_seed,
            raw_data=data_rand,
            trial_num=-1,
            rewards=total_costs,
            steps=total_steps,
            nll=None,
        )
        save_log(cfg, -1, trial_log)

        model, train_log = train_model(X, U, dX, cfg.model)

        # Alternate MPC rollouts with model retraining.
        for i in range(cfg.experiment.num_roll - cfg.experiment.random):
            controller = MPController(env, model, cfg)

            r = 0
            data_rs = []
            while r < cfg.experiment.repeat:
                data_r = rollout(env, controller, cfg.experiment, metric=metric)
                plot_rollout(data_r[0], data_r[1], pry=cfg.pid.params.pry, save=cfg.save, loc=f"/{i}_{r}")
                rews = data_r[-2]
                sim_error = data_r[-1]
                if sim_error:
                    print("Repeating strange simulation")
                    continue
                total_costs.append(np.sum(rews))  # summed for minimization
                r += 1
                data_rs.append(data_r)
                total_steps.append(np.shape(X)[0])

            X, dX, U = combine_data(data_rs, (X, dX, U))

            msg = "Rollouts completed: "
            msg += f"mean cumulative reward {np.mean(total_costs)}, "
            msg += f"mean flight length {cfg.policy.params.period * np.mean([np.shape(d[0])[0] for d in data_rs])}"
            log.info(msg)

            trial_log = dict(
                env_name=cfg.env.params.name,
                model=model,
                seed=cfg.random_seed,
                raw_data=data_rs,
                trial_num=i,
                rewards=total_costs,
                steps=total_steps,
                nll=train_log,
            )
            save_log(cfg, i, trial_log)

            model, train_log = train_model(X, U, dX, cfg.model)

    fig = plot_rewards_over_trials(np.transpose(np.stack([total_costs])), env_name, save=True)
    fig.write_image(os.getcwd() + "/learning-curve.pdf")
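# Hedged sketch of the one-step dataset layout that to_XUdX/combine_data are
# assumed to produce: inputs X[t] and U[t] with targets dX[t] = X[t+1] - X[t],
# the standard form for training a one-step dynamics model. The shapes and the
# body below are assumptions, not the project's actual implementation.
def _sketch_to_XUdX(states, actions):
    states = np.asarray(states)    # (T, state_dim)
    actions = np.asarray(actions)  # (T, action_dim)
    X = states[:-1]
    dX = states[1:] - states[:-1]
    U = actions[:-1]
    return X, dX, U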
# Variant of mpc() that seeds the environment, optionally randomizes inertial
# properties, and supports the additional yaw metrics.
def mpc(cfg):
    log.info("============= Configuration =============")
    log.info(f"Config:\n{cfg.pretty()}")
    log.info("=========================================")

    env_name = cfg.env.params.name
    env = gym.make(env_name)
    env.reset()
    env.seed(cfg.random_seed, inertial=cfg.experiment.inertial)
    if cfg.experiment.inertial:
        log.info(f"Running experiment with inertial prop x:{env.Ixx}, y:{env.Iyy}")

    if cfg.metric.name == 'Living':
        metric = living_reward
        log.info("Using metric living reward")
    elif cfg.metric.name == 'Rotation':
        metric = rotation_mat
        log.info("Using metric rotation matrix")
    elif cfg.metric.name == 'Square':
        metric = squ_cost
        log.info("Using metric square cost")
    elif cfg.metric.name == 'Yaw':
        metric = yaw_r
        log.info("Using metric yaw sliding mode")
    elif cfg.metric.name == 'Yaw2':
        metric = yaw_r2
        log.info("Using metric yaw base")
    elif cfg.metric.name == 'Yaw3':
        metric = yaw_r3
        log.info("Using metric yaw rate")
    else:
        raise ValueError("Improper metric name passed")

    for s in range(cfg.experiment.seeds):
        log.info(f"Random Seed: {s}")
        total_costs = []
        data_rand = []
        total_steps = []

        # Collect initial data with a random controller.
        r = 0
        while r < cfg.experiment.random:
            data_r = rollout(env, RandomController(env, cfg), cfg.experiment, metric=metric)
            if env_name != 'CartPoleContEnv-v0':
                plot_rollout(data_r[0], data_r[1], pry=cfg.pid.params.pry, save=cfg.save, loc=f"/R_{r}")
            rews = data_r[-2]
            sim_error = data_r[-1]
            if sim_error:
                print("Repeating strange simulation")
                continue
            total_costs.append(np.sum(rews))  # summed for minimization
            r += 1
            data_rand.append(data_r)
            total_steps.append(0)

        X, dX, U = to_XUdX(data_r)
        X, dX, U = combine_data(data_rand[:-1], (X, dX, U))

        msg = "Random rollouts completed: "
        msg += f"mean cumulative reward {np.mean(total_costs)}, "
        msg += f"mean length {np.mean([len(a[0]) for a in data_rand])}"
        log.info(msg)

        last_yaw = np.max(np.abs(np.stack(data_r[0])[:, 2]))  # peak |yaw| over the rollout
        trial_log = dict(
            env_name=cfg.env.params.name,
            seed=cfg.random_seed,
            raw_data=data_r,
            # yaw_num=last_yaw,
            trial_num=-1,
            rewards=total_costs,
            steps=total_steps,
        )
        save_log(cfg, -1, trial_log)

        model, train_log = train_model(X.squeeze(), U, dX.squeeze(), cfg.model)
        # Alternate MPC rollouts with model retraining.
        for i in range(cfg.experiment.num_roll - cfg.experiment.random):
            controller = MPController(env, model, cfg)

            r = 0
            data_rs = []
            while r < cfg.experiment.repeat:
                data_r = rollout(env, controller, cfg.experiment, metric=metric)
                plot_rollout(data_r[0], data_r[1], pry=cfg.pid.params.pry, save=cfg.save, loc=f"/{i}_{r}")
                rews = data_r[-2]
                sim_error = data_r[-1]
                if sim_error:
                    print("Repeating strange simulation")
                    continue
                total_costs.append(np.sum(rews))  # summed for minimization
                r += 1
                data_rs.append(data_r)
                total_steps.append(np.shape(X)[0])

            X, dX, U = combine_data(data_rs, (X, dX, U))

            msg = "Rollouts completed: "
            msg += f"cumulative reward {total_costs[-1]}, "
            msg += f"length {len(data_r[0])}"
            log.info(msg)

            last_yaw = np.max(np.abs(np.stack(data_r[0])[:, 2]))  # peak |yaw| over the rollout
            trial_log = dict(
                env_name=cfg.env.params.name,
                seed=cfg.random_seed,
                raw_data=data_r,
                # yaw_num=last_yaw,
                trial_num=i,
                rewards=total_costs,
                steps=total_steps,
                nll=train_log,
            )
            save_log(cfg, i, trial_log)

            model, train_log = train_model(X, U, dX, cfg.model)

    fig = plot_rewards_over_trials(np.transpose(np.stack([total_costs])), env_name, save=True)
    fig.write_image(os.getcwd() + "/learning-curve.pdf")
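# The metric dispatch in mpc() assumes each metric is a callable of the form
# metric(obs, action) -> float. A hypothetical metric in that shape, penalizing
# yaw magnitude (state index 2, matching the indexing used for last_yaw above);
# it is not one of the project's metrics.
def example_yaw_metric(obs, action):
    return -abs(obs[2])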