def run_pca(policy):
    max_traj_len = 1000
    from sklearn.decomposition import PCA

    with torch.no_grad():
        env = env_factory(False)()
        state = env.reset()

        done = False
        timesteps = 0
        eval_reward = 0

        if hasattr(policy, 'init_hidden_state'):
            policy.init_hidden_state()

        mems = []
        while not done and timesteps < max_traj_len:
            env.speed = 0.5
            action = policy.forward(torch.Tensor(state)).numpy()
            state, reward, done, _ = env.step(action)
            env.render()
            eval_reward += reward
            timesteps += 1

            memory = get_hiddens(policy)
            mems.append(memory)

        data = np.vstack(mems)
        pca = PCA(n_components=2)

        fig = plt.figure()
        plt.axis('off')

        base = (0.05, 0.05, 0.05)
        components = pca.fit_transform(data)
        x = components[:, 0]
        y = components[:, 1]

        c = []
        for i in range(len(x)):
            c.append(np.hstack([base, (len(x) - i / 2) / len(x)]))

        plt.scatter(x, y, color=c, s=0.8)
        plt.show()
        plt.close()
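
# Example usage (not part of the original file): a minimal sketch showing how run_pca
# might be invoked on a saved actor checkpoint. The checkpoint path here is hypothetical.
def _example_run_pca():
    policy = torch.load("./trained_models/actor.pt")  # hypothetical path
    policy.eval()
    run_pca(policy)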
def collect_point(policy, max_traj_len):
    """
    A helper function which collects a single memory-dynamics parameter pair from a trajectory.
    """
    env = env_factory(True)()

    chosen_timestep = np.random.randint(15, max_traj_len)
    timesteps = 0
    done = False

    if hasattr(policy, 'init_hidden_state'):
        policy.init_hidden_state()

    state = env.reset()
    while not done and timesteps < chosen_timestep:
        action = policy(torch.Tensor(state)).numpy()
        state, _, done, _ = env.step(action)
        timesteps += 1

    return get_hiddens(policy), env.get_damping(), env.get_mass(), env.get_ipos()
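
# Illustrative sketch (not part of the original file): building a small dataset of
# (latent, damping, mass, com) tuples by calling collect_point repeatedly. The number
# of points and trajectory length are arbitrary choices.
def _example_collect_dataset(policy, n_points=100, max_traj_len=300):
    latents, damps, masses, ipos = [], [], [], []
    for _ in range(n_points):
        h, d, m, p = collect_point(policy, max_traj_len)
        latents.append(h)
        damps.append(d)
        masses.append(m)
        ipos.append(p)
    return latents, damps, masses, ipos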
parser.add_argument("--report", default=False, action="store_true", help="Whether to report stats or not") args = parser.parse_args() run_args = pickle.load(open(os.path.join(args.path, "experiment.pkl"), "rb")) # Make mirror False so that env_factory returns a regular wrap env function and not a symmetric env function that can be called to return # a cassie environment (symmetric env cannot be called to make another env) if hasattr(run_args, 'simrate'): env_fn = env_factory(run_args.env_name, traj=run_args.traj, simrate=run_args.simrate, state_est=run_args.state_est, no_delta=run_args.no_delta, dynamics_randomization=run_args.dyn_random, mirror=False, clock_based=run_args.clock_based, reward=run_args.reward, history=run_args.history) else: env_fn = env_factory(run_args.env_name, traj=run_args.traj, state_est=run_args.state_est, no_delta=run_args.no_delta, dynamics_randomization=run_args.dyn_random, mirror=False, clock_based=run_args.clock_based, reward=run_args.reward, history=run_args.history) cassie_env = env_fn()
def run_experiment(args):
    """
    The entry point for the dynamics extraction algorithm.
    """
    from util import create_logger

    locale.setlocale(locale.LC_ALL, '')

    policy = torch.load(args.policy)

    env_fn = env_factory(True)

    layers = [int(x) for x in args.layers.split(',')]

    env = env_fn()
    policy.init_hidden_state()
    policy(torch.tensor(env.reset()).float())
    latent_dim = get_hiddens(policy).shape[0]

    models = []
    opts = []
    for fn in [env.get_damping, env.get_mass, env.get_ipos]:
        output_dim = fn().shape[0]
        model = Model(latent_dim, output_dim, layers=layers)
        models += [model]
        opts += [optim.Adam(model.parameters(), lr=args.lr, eps=1e-5)]
        model.policy_path = args.policy

    logger = create_logger(args)

    best_loss = None
    actor_dir = os.path.split(args.policy)[0]
    create_new = True
    if os.path.exists(os.path.join(logger.dir, 'test_latents.pt')):
        x = torch.load(os.path.join(logger.dir, 'train_latents.pt'))
        test_x = torch.load(os.path.join(logger.dir, 'test_latents.pt'))

        damps = torch.load(os.path.join(logger.dir, 'train_damps.pt'))
        test_damps = torch.load(os.path.join(logger.dir, 'test_damps.pt'))

        masses = torch.load(os.path.join(logger.dir, 'train_masses.pt'))
        test_masses = torch.load(os.path.join(logger.dir, 'test_masses.pt'))

        ipos = torch.load(os.path.join(logger.dir, 'train_ipos.pt'))
        test_ipos = torch.load(os.path.join(logger.dir, 'test_ipos.pt'))

        if args.points > len(x) + len(test_x):
            create_new = True
        else:
            create_new = False

    if create_new:
        if not ray.is_initialized():
            ray.init(num_cpus=args.workers)

        print("Collecting {:4d} timesteps of data.".format(args.points))
        points_per_worker = max(args.points // args.workers, 1)
        start = time.time()

        damps, masses, ipos, x = concat(
            ray.get([
                collect_data.remote(policy, points=points_per_worker)
                for _ in range(args.workers)
            ]))

        split = int(0.8 * len(x))

        test_x = x[split:]
        x = x[:split]

        test_damps = damps[split:]
        damps = damps[:split]

        test_masses = masses[split:]
        masses = masses[:split]

        test_ipos = ipos[split:]
        ipos = ipos[:split]

        print("{:3.2f}s to collect {} timesteps.\nTraining set is {}, test set is {}".format(
            time.time() - start, len(x) + len(test_x), len(x), len(test_x)))

        torch.save(x, os.path.join(logger.dir, 'train_latents.pt'))
        torch.save(test_x, os.path.join(logger.dir, 'test_latents.pt'))

        torch.save(damps, os.path.join(logger.dir, 'train_damps.pt'))
        torch.save(test_damps, os.path.join(logger.dir, 'test_damps.pt'))

        torch.save(masses, os.path.join(logger.dir, 'train_masses.pt'))
        torch.save(test_masses, os.path.join(logger.dir, 'test_masses.pt'))

        torch.save(ipos, os.path.join(logger.dir, 'train_ipos.pt'))
        torch.save(test_ipos, os.path.join(logger.dir, 'test_ipos.pt'))

    for epoch in range(args.epochs):
        random_indices = SubsetRandomSampler(range(len(x) - 1))
        sampler = BatchSampler(random_indices, args.batch_size, drop_last=False)

        for j, batch_idx in enumerate(sampler):
            batch_x = x[batch_idx]  # .float()
            batch = [damps[batch_idx], masses[batch_idx], ipos[batch_idx]]

            losses = []
            for model, batch_y, opt in zip(models, batch, opts):
                loss = 0.5 * (batch_y - model(batch_x)).pow(2).mean()

                opt.zero_grad()
                loss.backward()
                opt.step()

                losses.append(loss.item())

            print("Epoch {:3d} batch {:4d}/{:4d} ".format(epoch, j, len(sampler) - 1), end='\r')

        train_y = [damps, masses, ipos]
        test_y = [test_damps, test_masses, test_ipos]
        order = ['damping', 'mass', 'com']

        with torch.no_grad():
            print("\nEpoch {:3d} losses:".format(epoch))
            for model, y_tr, y_te, name in zip(models, train_y, test_y, order):
                loss_total = 0.5 * (y_tr - model(x)).pow(2).mean().item()

                preds = model(test_x)
                test_loss = 0.5 * (y_te - preds).pow(2).mean().item()
                pce = torch.mean(torch.abs((y_te - preds) / (y_te + 1e-5)))
                err = torch.mean(torch.abs((y_te - preds)))

                logger.add_scalar(logger.arg_hash + '/' + name + '_loss', test_loss, epoch)
                logger.add_scalar(logger.arg_hash + '/' + name + '_percenterr', pce, epoch)
                logger.add_scalar(logger.arg_hash + '/' + name + '_abserr', err, epoch)

                model.dyn_parameter = name
                torch.save(model, os.path.join(logger.dir, name + '_extractor.pt'))

                print("\t{:16s}: train loss {:7.6f} test loss {:7.6f}, err {:5.4f}, percent err {:3.2f}".format(
                    name, loss_total, test_loss, err, pce))
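
# Illustrative sketch (not part of the original file): loading one of the extractor
# models saved above and predicting a dynamics parameter from the policy's current
# hidden state. The log directory argument is hypothetical.
def _example_predict_damping(policy, logdir):
    extractor = torch.load(os.path.join(logdir, 'damping_extractor.pt'))
    with torch.no_grad():
        latent = get_hiddens(policy)
        return extractor(latent)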
def run_experiment(args):
    torch.set_num_threads(1)

    from util import create_logger, env_factory, eval_policy, train_normalizer
    from nn.critic import FF_V, LSTM_V
    from nn.actor import FF_Stochastic_Actor, LSTM_Stochastic_Actor

    import locale, os
    locale.setlocale(locale.LC_ALL, '')

    # wrapper function for creating parallelized envs
    env_fn = env_factory(args.randomize)
    obs_dim = env_fn().observation_space.shape[0]
    action_dim = env_fn().action_space.shape[0]

    layers = [int(x) for x in args.layers.split(',')]

    # Set seeds
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    if args.recurrent:
        policy = LSTM_Stochastic_Actor(obs_dim, action_dim,
                                       layers=layers,
                                       dynamics_randomization=args.randomize,
                                       fixed_std=torch.ones(action_dim) * args.std)
        critic = LSTM_V(obs_dim, layers=layers)
    else:
        policy = FF_Stochastic_Actor(obs_dim, action_dim,
                                     layers=layers,
                                     dynamics_randomization=args.randomize,
                                     fixed_std=torch.ones(action_dim) * args.std)
        critic = FF_V(obs_dim, layers=layers)

    env = env_fn()

    policy.train(0)
    critic.train(0)

    print("Collecting normalization statistics with {} states...".format(args.prenormalize_steps))
    train_normalizer(policy, args.prenormalize_steps, max_traj_len=args.traj_len, noise=1)
    critic.copy_normalizer_stats(policy)

    algo = PPO(policy, critic, env_fn, args)

    # create a tensorboard logging object
    if not args.nolog:
        logger = create_logger(args)
    else:
        logger = None

    if args.save_actor is None and logger is not None:
        args.save_actor = os.path.join(logger.dir, 'actor.pt')

    if args.save_critic is None and logger is not None:
        args.save_critic = os.path.join(logger.dir, 'critic.pt')

    print()
    print("Proximal Policy Optimization:")
    print("\tseed: {}".format(args.seed))
    print("\ttimesteps: {:n}".format(int(args.timesteps)))
    print("\titeration steps: {:n}".format(int(args.sample)))
    print("\tprenormalize steps: {}".format(int(args.prenormalize_steps)))
    print("\ttraj_len: {}".format(args.traj_len))
    print("\tdiscount: {}".format(args.discount))
    print("\tactor_lr: {}".format(args.a_lr))
    print("\tcritic_lr: {}".format(args.c_lr))
    print("\tgrad clip: {}".format(args.grad_clip))
    print("\tbatch size: {}".format(args.batch_size))
    print("\tepochs: {}".format(args.epochs))
    print("\trecurrent: {}".format(args.recurrent))
    print("\tdynamics rand: {}".format(args.randomize))
    print("\tworkers: {}".format(args.workers))
    print()

    itr = 0
    timesteps = 0
    best_reward = None
    while timesteps < args.timesteps:
        kl, a_loss, c_loss, steps = algo.do_iteration(args.sample, args.traj_len, args.epochs,
                                                      batch_size=args.batch_size, kl_thresh=args.kl)
        eval_reward = eval_policy(algo.actor, env, episodes=5, max_traj_len=args.traj_len,
                                  verbose=False, visualize=False)

        timesteps += steps
        print("iter {:4d} | return: {:5.2f} | KL {:5.4f} | timesteps {:n}".format(itr, eval_reward, kl, timesteps))

        if best_reward is None or eval_reward > best_reward:
            print("\t(best policy so far! saving to {})".format(args.save_actor))
            best_reward = eval_reward
            if args.save_actor is not None:
                torch.save(algo.actor, args.save_actor)
            if args.save_critic is not None:
                torch.save(algo.critic, args.save_critic)

        if logger is not None:
            logger.add_scalar('cassie/kl', kl, itr)
            logger.add_scalar('cassie/return', eval_reward, itr)
            logger.add_scalar('cassie/actor_loss', a_loss, itr)
            logger.add_scalar('cassie/critic_loss', c_loss, itr)

        itr += 1
    print("Finished ({} of {}).".format(timesteps, args.timesteps))
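
# Illustrative sketch (not part of the original file): visualizing an actor saved by
# run_experiment with the same eval_policy helper used above. The log directory and
# trajectory length are hypothetical.
def _example_visualize_actor(logdir, traj_len=400):
    from util import env_factory, eval_policy
    actor = torch.load(os.path.join(logdir, 'actor.pt'))
    env = env_factory(False)()
    return eval_policy(actor, env, episodes=1, max_traj_len=traj_len, verbose=True, visualize=True)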
def collect_data():
    seeds = [0, 10, 20]
    wait_cycles = 3
    num_cycles = 10
    speed = 1.0

    # Make envs
    aslip_path = "./trained_models/comparison/aslip_delta_policies/traj-aslip_aslip_old_2048_12288_seed-{}".format(seeds[0])
    iros_path = "./trained_models/comparison/iros_retrain_policies/clock_traj-walking_iros_paper_2048_12288_seed-{}".format(seeds[0])

    aslip_args = pickle.load(open(os.path.join(aslip_path, "experiment.pkl"), "rb"))
    aslip_env_fn = env_factory(aslip_args.env_name,
                               traj=aslip_args.traj,
                               state_est=aslip_args.state_est,
                               no_delta=aslip_args.no_delta,
                               dynamics_randomization=aslip_args.dyn_random,
                               mirror=False,
                               clock_based=aslip_args.clock_based,
                               reward="iros_paper",
                               history=aslip_args.history)
    aslip_env = aslip_env_fn()

    iros_args = pickle.load(open(os.path.join(iros_path, "experiment.pkl"), "rb"))
    iros_env_fn = env_factory(iros_args.env_name,
                              traj=iros_args.traj,
                              state_est=iros_args.state_est,
                              no_delta=iros_args.no_delta,
                              dynamics_randomization=iros_args.dyn_random,
                              mirror=False,
                              clock_based=iros_args.clock_based,
                              reward="iros_paper",
                              history=iros_args.history)
    iros_env = iros_env_fn()

    aslip_state = torch.Tensor(aslip_env.reset_for_test())
    iros_state = torch.Tensor(iros_env.reset_for_test())
    aslip_env.update_speed(speed)
    iros_env.speed = speed

    aslip_phaselen = aslip_env.phaselen + 1
    iros_phaselen = iros_env.phaselen + 1
    aslip_data = np.zeros((len(seeds), num_cycles, aslip_env.simrate * aslip_phaselen, 2))
    iros_data = np.zeros((len(seeds), num_cycles, iros_env.simrate * iros_phaselen, 2))

    for s in range(len(seeds)):
        print("running seed {}".format(seeds[s]))
        aslip_path = "./trained_models/comparison/aslip_delta_policies/traj-aslip_aslip_old_2048_12288_seed-{}".format(seeds[s])
        iros_path = "./trained_models/comparison/iros_retrain_policies/clock_traj-walking_iros_paper_2048_12288_seed-{}".format(seeds[s])

        # Load policies
        aslip_policy = torch.load(os.path.join(aslip_path, "actor.pt"))
        iros_policy = torch.load(os.path.join(iros_path, "actor.pt"))

        # print("iros: ", iros_env.simrate, iros_env.phaselen)
        # print("aslip: ", aslip_env.simrate, aslip_env.phaselen)

        with torch.no_grad():
            # Run a few cycles to stabilize (done separately in case the two envs have different phaselens)
            for i in range(wait_cycles * aslip_phaselen):
                action = aslip_policy.forward(torch.Tensor(aslip_state), deterministic=True).detach().numpy()
                aslip_state, reward, done, _ = aslip_env.step(action)
                aslip_state = torch.Tensor(aslip_state)
                # curr_qpos = aslip_env.sim.qpos()
                # print("curr height: ", curr_qpos[2])

            for i in range(wait_cycles * iros_phaselen):
                action = iros_policy.forward(torch.Tensor(iros_state), deterministic=True).detach().numpy()
                iros_state, reward, done, _ = iros_env.step(action)
                iros_state = torch.Tensor(iros_state)

            # Collect actual data
            print("Start actual data")
            for i in range(num_cycles):
                for j in range(aslip_phaselen):
                    action = aslip_policy.forward(torch.Tensor(aslip_state), deterministic=True).detach().numpy()
                    for k in range(aslip_env.simrate):
                        aslip_env.step_simulation(action)
                        aslip_data[s, i, j * aslip_env.simrate + k, :] = aslip_env.sim.get_foot_forces()
                    aslip_env.time += 1
                    aslip_env.phase += aslip_env.phase_add
                    if aslip_env.phase > aslip_env.phaselen:
                        aslip_env.phase = 0
                        aslip_env.counter += 1
                    aslip_state = aslip_env.get_full_state()

            for i in range(num_cycles):
                for j in range(iros_phaselen):
                    action = iros_policy.forward(torch.Tensor(iros_state), deterministic=True).detach().numpy()
                    for k in range(iros_env.simrate):
                        iros_env.step_simulation(action)
                        iros_data[s, i, j * iros_env.simrate + k, :] = iros_env.sim.get_foot_forces()
                    iros_env.time += 1
                    iros_env.phase += iros_env.phase_add
                    if iros_env.phase > iros_env.phaselen:
                        iros_env.phase = 0
                        iros_env.counter += 1
                    iros_state = iros_env.get_full_state()

    np.save("./trained_models/comparison/aslip_delta_policies/avg_GRFs_speed{}".format(speed), aslip_data)
    np.save("./trained_models/comparison/iros_retrain_policies/avg_GRFs_speed{}".format(speed), iros_data)
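
# Illustrative sketch (not part of the original file): loading one of the saved GRF
# arrays and averaging over seeds and cycles. The array layout follows collect_data
# above: (seeds, cycles, simrate * phaselen, 2). The filename is hypothetical.
def _example_plot_avg_grfs(path="./trained_models/comparison/aslip_delta_policies/avg_GRFs_speed1.0.npy"):
    grfs = np.load(path)
    avg = grfs.mean(axis=(0, 1))  # average over seeds and cycles
    plt.plot(avg[:, 0], label="foot 0")
    plt.plot(avg[:, 1], label="foot 1")
    plt.legend()
    plt.show()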
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--num_procs", default=20, type=int)
    parser.add_argument("--traj_len", default=400, type=int)
    parser.add_argument("--num_steps", default=10, type=int)
    parser.add_argument("--num_trials", default=5, type=int)
    parser.add_argument("--debug", default=False, action='store_true')
    parser.add_argument("--no_vis", default=False, action='store_true')
    parser.add_argument("--eval", default=True, action="store_false", help="Whether to call policy.eval() or not")
    args = parser.parse_args()

    ray.init(num_cpus=args.num_procs)

    paths = [
        "./trained_models/ppo/Cassie-v0/traj-aslip_aslip_old_2048_12288_seed-0/",
        "./trained_models/ppo/Cassie-v0/traj-aslip_aslip_old_2048_12288_seed-10/",
        "./trained_models/ppo/Cassie-v0/traj-aslip_aslip_old_2048_12288_seed-20/",
        "./trained_models/ppo/Cassie-v0/traj-aslip_aslip_old_2048_12288_seed-30/"
    ]

    data = []
    ideal_foot_poses = []
    actual_foot_poses = []
    placement_errors = []
    for path in paths:
        # Get policy, create env constructor
        run_args = pickle.load(open(path + "experiment.pkl", "rb"))
        policy = torch.load(path + "actor.pt")
        env_fn = env_factory("Cassie-v0",
                             traj="aslip",
                             state_est=run_args.state_est,
                             no_delta=run_args.no_delta,
                             learn_gains=run_args.learn_gains,
                             ik_baseline=run_args.ik_baseline,
                             dynamics_randomization=run_args.dyn_random,
                             clock_based=run_args.clock_based,
                             reward="aslip_old",
                             history=run_args.history)

        # parallelized loop for speed
        data_id = [
            footstep_test.remote(policy, env_fn, speed, args.num_trials, args.num_steps)
            for speed in speeds
        ]
        foo = ray.get(data_id)
        data.append(foo)

    data = np.array(data)
    print(data.shape)
    with open('data.pkl', 'wb') as f:
        pickle.dump(data, f)