def collect_point(policy, max_traj_len):
    """
    A helper function which collects a single memory-dynamics parameter pair
    from a trajectory.
    """
    legacy = 'legacy' if not (hasattr(policy, 'legacy') and policy.legacy == False) else ''
    env = env_factory(policy.env_name + legacy)()
    env.dynamics_randomization = True

    chosen_timestep = np.random.randint(15, max_traj_len)
    timesteps = 0
    done = False

    if hasattr(policy, 'init_hidden_state'):
        policy.init_hidden_state()

    state = env.reset()
    while not done and timesteps < chosen_timestep:
        state = policy.normalize_state(state, update=True)
        action = policy(state).numpy()
        state, _, done, _ = env.step(action)
        timesteps += 1

    return get_hiddens(policy), env.get_friction(), env.get_damping(), env.get_mass(), env.get_quat()
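# --- Illustrative sketch (not part of the original file) ----------------------
# How collect_point might be used to assemble a latent/dynamics-parameter
# dataset. The names `policy` and `get_hiddens` are assumed to exist in this
# module, as in collect_point above; the trajectory budget is a placeholder.
def _collect_latent_dataset_sketch(policy, n_points, max_traj_len=300):
    latents, frictions = [], []
    for _ in range(n_points):
        hiddens, friction, damping, mass, quat = collect_point(policy, max_traj_len)
        latents.append(hiddens)
        frictions.append(friction)
    return np.stack(latents), np.stack(frictions)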
@ray.remote
def collect_data(actor, timesteps, max_traj_len, seed):
    """
    A Ray remote function which collects information to be used when training QBNs.
    Specifically, it returns a list of states, actions, hidden states, and
    (if the actor is an LSTM policy) cell states.
    """
    torch.set_num_threads(1)
    np.random.seed(seed)
    policy = deepcopy(actor)
    layertype = policy.layers[0].__class__.__name__
    with torch.no_grad():  # no gradients necessary
        env = env_factory(policy.env_name)()
        num_steps = 0
        states, actions, hiddens, cells = [], [], [], []

        while num_steps < timesteps:
            state = torch.as_tensor(env.reset())
            done = False
            traj_len = 0

            if hasattr(actor, 'init_hidden_state'):
                # initialize the memory of the network if recurrent
                actor.init_hidden_state()

            while not done and traj_len < max_traj_len:
                state = torch.as_tensor(state).float()
                action = actor(state)
                hidden = actor.hidden[0].view(-1)
                if layertype == 'LSTMCell':
                    cell = actor.cells[0].view(-1)

                # append the information we care about to lists to be returned.
                states.append(state.numpy())
                actions.append(action.numpy())
                hiddens.append(hidden.numpy())
                if layertype == 'LSTMCell':
                    cells.append(cell.numpy())

                state, _, done, _ = env.step(action.numpy())
                traj_len += 1
                num_steps += 1

        states = np.array(states)
        actions = np.array(actions)
        hiddens = np.array(hiddens)
        cells = np.array(cells)
    return [states, actions, hiddens, cells]
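# --- Illustrative sketch (not part of the original file) ----------------------
# Fanning collect_data out over several Ray workers and stacking the results,
# mirroring the call made in run_experiment below. The worker count and
# timestep budget here are placeholder values.
def _collect_parallel_sketch(policy, total_timesteps=40000, workers=4):
    data = ray.get([collect_data.remote(policy, total_timesteps // workers, 400,
                                        np.random.randint(65535))
                    for _ in range(workers)])
    states  = np.vstack([r[0] for r in data])
    actions = np.vstack([r[1] for r in data])
    hiddens = np.vstack([r[2] for r in data])
    return states, actions, hiddens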
def run_experiment(args):
    torch.set_num_threads(1)

    from util.env import env_factory, train_normalizer
    from util.log import create_logger
    from policies.actor import FF_Actor, LSTM_Actor, GRU_Actor

    import locale, os
    locale.setlocale(locale.LC_ALL, '')

    # wrapper function for creating parallelized envs
    env_fn = env_factory(args.env)
    obs_dim = env_fn().observation_space.shape[0]
    action_dim = env_fn().action_space.shape[0]

    # Set seeds
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    std = torch.ones(action_dim) * args.std

    layers = [int(x) for x in args.layers.split(',')]

    if args.arch.lower() == 'lstm':
        policy = LSTM_Actor(obs_dim, action_dim, env_name=args.env, layers=layers)
    elif args.arch.lower() == 'gru':
        policy = GRU_Actor(obs_dim, action_dim, env_name=args.env, layers=layers)
    elif args.arch.lower() == 'ff':
        policy = FF_Actor(obs_dim, action_dim, env_name=args.env, layers=layers)
    else:
        raise RuntimeError("Invalid architecture '{}'; expected 'lstm', 'gru', or 'ff'.".format(args.arch))
    policy.legacy = False

    env = env_fn()

    print("Collecting normalization statistics with {} states...".format(args.prenormalize_steps))
    train_normalizer(policy, args.prenormalize_steps, max_traj_len=args.traj_len, noise=1)
    pass
def run_udp(args):
    from util.env import env_factory

    policy = torch.load(args.policy)
    #policy.eval()

    env = env_factory(policy.env_name)()
    if not env.state_est:
        print("This policy was not trained with state estimation and cannot be run on the robot.")
        raise RuntimeError

    print("This policy is: {}".format(policy.__class__.__name__))
    time.sleep(1)

    time_log   = []  # time stamp
    input_log  = []  # network inputs
    output_log = []  # network outputs
    state_log  = []  # cassie state
    target_log = []  # PD target log

    clock_based = env.clock
    no_delta = env.no_delta

    u = pd_in_t()
    for i in range(5):
        u.leftLeg.motorPd.pGain[i]  = env.P[i]
        u.leftLeg.motorPd.dGain[i]  = env.D[i]
        u.rightLeg.motorPd.pGain[i] = env.P[i]
        u.rightLeg.motorPd.dGain[i] = env.D[i]

    if platform.node() == 'cassie':
        cassie = CassieUdp(remote_addr='10.10.10.3', remote_port='25010',
                           local_addr='10.10.10.100', local_port='25011')
    else:
        cassie = CassieUdp()  # local testing

    print('Connecting...')
    y = None
    while y is None:
        cassie.send_pd(pd_in_t())
        time.sleep(0.001)
        y = cassie.recv_newest_pd()

    received_data = True
    t = time.monotonic()
    t0 = t

    print('Connected!\n')

    action = 0

    # Whether or not STO has been TOGGLED (i.e. it does not count the initial STO condition)
    # STO = True means that STO is ON (i.e. robot is not running) and STO = False means that STO is
    # OFF (i.e. robot *is* running)
    sto = True
    sto_count = 0

    orient_add = 0

    # We have multiple modes of operation
    # 0: Normal operation, walking with policy
    # 1: Start up, Standing Pose with variable height (no balance)
    # 2: Stop Drop and hopefully not roll, Damping Mode with no P gain
    operation_mode = 0
    standing_height = 0.7
    MAX_HEIGHT = 0.8
    MIN_HEIGHT = 0.4
    D_mult = 1  # Really bad stability problems if this is pushed higher as a multiplier.
                # Might be worth tuning by joint, but something else is probably needed.

    phase = 0
    counter = 0
    phase_add = 1
    speed = 0

    max_speed = 1
    min_speed = -1
    max_y_speed = 0.0
    min_y_speed = 0.0

    old_settings = termios.tcgetattr(sys.stdin)

    try:
        tty.setcbreak(sys.stdin.fileno())
        while True:
            t = time.monotonic()
            tt = time.monotonic() - t0

            # Get newest state
            state = cassie.recv_newest_pd()
            if state is None:
                print('Missed a cycle!')
                continue

            if platform.node() == 'cassie':
                # Radio control
                orient_add -= state.radio.channel[1] / 60.0

                # Reset orientation on STO
                if state.radio.channel[8] < 0:
                    orient_add = quaternion2euler(state.pelvis.orientation[:])[2]

                    # Save log files after STO toggle (skipping first STO)
                    if sto is False:
                        #log(sto_count)
                        sto_count += 1
                        sto = True
                        # Clear out logs
                        time_log   = []  # time stamp
                        input_log  = []  # network inputs
                        output_log = []  # network outputs
                        state_log  = []  # cassie state
                        target_log = []  # PD target log

                    if hasattr(policy, 'init_hidden_state'):
                        print("RESETTING HIDDEN STATES TO ZERO!")
                        policy.init_hidden_state()
                else:
                    sto = False

                if state.radio.channel[15] < 0 and hasattr(policy, 'init_hidden_state'):
                    print("(TOGGLE SWITCH) RESETTING HIDDEN STATES TO ZERO!")
                    policy.init_hidden_state()

                # Switch the operation mode based on the toggle next to STO
                if state.radio.channel[9] < -0.5:    # towards operator means damping shutdown mode
                    operation_mode = 2
                elif state.radio.channel[9] > 0.5:   # away from the operator means standing pose
                    operation_mode = 1
                    standing_height = MIN_HEIGHT + (MAX_HEIGHT - MIN_HEIGHT) * 0.5 * (state.radio.channel[6] + 1)
                else:                                # middle means normal walking
                    operation_mode = 0

                curr_max = max_speed / 2
                speed_add = 0  # (max_speed / 2) * state.radio.channel[4]
                speed = max(min_speed, state.radio.channel[0] * curr_max + speed_add)
                speed = min(max_speed, state.radio.channel[0] * curr_max + speed_add)
                print('\tCH4: ' + str(state.radio.channel[4] / 3),
                      'CH5: ' + str(state.radio.channel[5] / 3))
                phase_add = 1  # + state.radio.channel[5]
            else:
                # Automatically change orientation and speed
                tt = time.monotonic() - t0
                if check_stdin():
                    c = sys.stdin.read(1)
                    if c == 'w':
                        speed += 0.1
                    if c == 's':
                        speed -= 0.1
                    if c == 'a':
                        orient_add -= 0.1
                    if c == 'd':
                        orient_add += 0.1
                    if c == 'r':
                        speed = 0.5
                        orient_add = 0

                speed = max(min_speed, speed)
                speed = min(max_speed, speed)

            #------------------------------- Normal Walking ---------------------------
            if operation_mode == 0 or operation_mode == 1:
                if operation_mode == 1:
                    if hasattr(policy, 'init_hidden_state'):
                        print("RESETTING HIDDEN STATES TO ZERO!")
                        policy.init_hidden_state()

                #print("speed: {:3.2f} | orientation {:3.2f}".format(speed, orient_add), end='\r')
                print("\tspeed: {:3.2f} | orientation {:3.2f}".format(speed, orient_add))

                # Reassign because it might have been changed by the damping mode
                for i in range(5):
                    u.leftLeg.motorPd.pGain[i]  = env.P[i]
                    u.leftLeg.motorPd.dGain[i]  = env.D[i]
                    u.rightLeg.motorPd.pGain[i] = env.P[i]
                    u.rightLeg.motorPd.dGain[i] = env.D[i]

                #print("EST PELVIS ORIENT: ", quaternion2euler(state.pelvis.orientation[:]))
                clock = [np.sin(2 * np.pi * phase / 27),
                         np.cos(2 * np.pi * phase / 27)]

                #quaternion = euler2quat(z=orient_add, y=state.radio.channel[4]/3, x=state.radio.channel[5]/3)
                # Quat before bias modification
                quaternion = euler2quat(z=orient_add, y=0, x=0)
                iquaternion = inverse_quaternion(quaternion)
                new_orient = quaternion_product(iquaternion, state.pelvis.orientation[:])

                # Adding bias to quat (ROLL PITCH YAW)
                #euler_orient = quaternion2euler(new_orient) + [state.radio.channel[4]/3, state.radio.channel[5]/3, 0]
                euler_orient = quaternion2euler(new_orient) + [0, -0.13, 0]
                new_orient = euler2quat(z=euler_orient[2], y=euler_orient[1], x=euler_orient[0])
                if new_orient[0] < 0:
                    new_orient = -new_orient

                new_translationalVelocity = rotate_by_quaternion(state.pelvis.translationalVelocity[:], iquaternion)

                ext_state = np.concatenate((clock, [speed]))
                robot_state = np.concatenate([
                    [state.pelvis.position[2] - state.terrain.height],  # pelvis height
                    new_orient,                                         # pelvis orientation
                    state.motor.position[:],                            # actuated joint positions
                    new_translationalVelocity[:],                       # pelvis translational velocity
                    state.pelvis.rotationalVelocity[:],                 # pelvis rotational velocity
                    state.motor.velocity[:],                            # actuated joint velocities
                    state.pelvis.translationalAcceleration[:],          # pelvis translational acceleration
                    state.joint.position[:],                            # unactuated joint positions
                    state.joint.velocity[:]                             # unactuated joint velocities
                ])
                RL_state = np.concatenate([robot_state, ext_state])

                #pretending the height is always 1.0
                #RL_state[0] = 1.0

                # Construct input vector
                torch_state = torch.Tensor(RL_state)
                torch_state = policy.normalize_state(torch_state, update=False)

                if no_delta:
                    offset = env.offset
                else:
                    offset = env.get_ref_state(phase=phase)

                action = policy(torch_state)
                env_action = action.data.numpy()
                target = env_action + offset

                # Send action
                for i in range(5):
                    u.leftLeg.motorPd.pTarget[i]  = target[i]
                    u.rightLeg.motorPd.pTarget[i] = target[i + 5]
                cassie.send_pd(u)

                # Logging
                if sto == False:
                    time_log.append(time.time())
                    state_log.append(state)
                    input_log.append(RL_state)
                    output_log.append(env_action)
                    target_log.append(target)

            # NOTE: this dead branch is kept as comments; as a bare string literal
            # between `if` and `elif` it was a syntax error in the original.
            #------------------------------- Start Up Standing ---------------------------
            # elif operation_mode == 1:
            #     print('Startup Standing. Height = ' + str(standing_height))
            #     # Do nothing
            #     # Reassign with new multiplier on damping
            #     for i in range(5):
            #         u.leftLeg.motorPd.pGain[i]  = 0.0
            #         u.leftLeg.motorPd.dGain[i]  = 0.0
            #         u.rightLeg.motorPd.pGain[i] = 0.0
            #         u.rightLeg.motorPd.dGain[i] = 0.0
            #
            #     # Send action
            #     for i in range(5):
            #         u.leftLeg.motorPd.pTarget[i]  = 0.0
            #         u.rightLeg.motorPd.pTarget[i] = 0.0
            #     cassie.send_pd(u)

            #------------------------------- Shutdown Damping ---------------------------
            elif operation_mode == 2:
                print('Shutdown Damping. Multiplier = ' + str(D_mult))
                # Reassign with new multiplier on damping
                for i in range(5):
                    u.leftLeg.motorPd.pGain[i]  = 0.0
                    u.leftLeg.motorPd.dGain[i]  = D_mult * env.D[i]
                    u.rightLeg.motorPd.pGain[i] = 0.0
                    u.rightLeg.motorPd.dGain[i] = D_mult * env.D[i]

                # Send action
                for i in range(5):
                    u.leftLeg.motorPd.pTarget[i]  = 0.0
                    u.rightLeg.motorPd.pTarget[i] = 0.0
                cassie.send_pd(u)

            #---------------------------- Other, should not happen -----------------------
            else:
                print('Error, in bad operation_mode with value: ' + str(operation_mode))

            # Measure delay
            # Wait until next cycle time
            while time.monotonic() - t < 60 / 2000:
                time.sleep(0.001)
            print('\tdelay: {:6.1f} ms'.format((time.monotonic() - t) * 1000))

            # Track phase
            phase += phase_add
            if phase >= 28:
                phase = 0
                counter += 1
    finally:
        termios.tcsetattr(sys.stdin, termios.TCSADRAIN, old_settings)
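# --- Illustrative sketch (not part of the original file) ----------------------
# The fixed-rate pacing pattern used in the control loop above, in isolation:
# each cycle runs the policy, then busy-waits until 60/2000 seconds have
# elapsed, so the loop runs at roughly 2000/60 Hz. `step_fn` is a placeholder.
def _paced_loop_sketch(step_fn, cycle_time=60 / 2000, cycles=10):
    for _ in range(cycles):
        t = time.monotonic()
        step_fn()
        while time.monotonic() - t < cycle_time:
            time.sleep(0.001)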
def run_experiment(args):
    torch.set_num_threads(1)

    from util.env import env_factory
    from util.log import create_logger
    from policies.critic import FF_V, LSTM_V
    from policies.actor import FF_Stochastic_Actor, LSTM_Stochastic_Actor

    import locale, os
    locale.setlocale(locale.LC_ALL, '')

    # wrapper function for creating parallelized envs
    env_fn = env_factory(args.env_name)
    obs_dim = env_fn().observation_space.shape[0]
    action_dim = env_fn().action_space.shape[0]

    # Set seeds
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    std = torch.ones(action_dim) * args.std

    if args.recurrent:
        policy = LSTM_Stochastic_Actor(obs_dim, action_dim, env_name=args.env_name, fixed_std=std, bounded=False)
        critic = LSTM_V(obs_dim)
    else:
        policy = FF_Stochastic_Actor(obs_dim, action_dim, env_name=args.env_name, fixed_std=std, bounded=False)
        critic = FF_V(obs_dim)

    env = env_fn()

    eval_policy(policy, env, True, min_timesteps=args.prenormalize_steps, max_traj_len=args.traj_len, noise=1)

    policy.train(0)
    critic.train(0)

    algo = PPO(policy, critic, env_fn, args)

    # create a tensorboard logging object
    if not args.nolog:
        logger = create_logger(args)
    else:
        logger = None

    if args.save_actor is None and logger is not None:
        args.save_actor = os.path.join(logger.dir, 'actor.pt')

    if args.save_critic is None and logger is not None:
        args.save_critic = os.path.join(logger.dir, 'critic.pt')

    print()
    print("Proximal Policy Optimization:")
    print("\tseed: {}".format(args.seed))
    print("\tenv: {}".format(args.env_name))
    print("\ttimesteps: {:n}".format(int(args.timesteps)))
    print("\titeration steps: {:n}".format(int(args.num_steps)))
    print("\tprenormalize steps: {}".format(int(args.prenormalize_steps)))
    print("\ttraj_len: {}".format(args.traj_len))
    print("\tdiscount: {}".format(args.discount))
    print("\tactor_lr: {}".format(args.a_lr))
    print("\tcritic_lr: {}".format(args.c_lr))
    print("\tadam eps: {}".format(args.eps))
    print("\tentropy coeff: {}".format(args.entropy_coeff))
    print("\tgrad clip: {}".format(args.grad_clip))
    print("\tbatch size: {}".format(args.batch_size))
    print("\tepochs: {}".format(args.epochs))
    print("\tworkers: {}".format(args.workers))
    print()

    itr = 0
    timesteps = 0
    best_reward = None
    while timesteps < args.timesteps:
        kl, a_loss, c_loss, steps = algo.do_iteration(args.num_steps, args.traj_len, args.epochs,
                                                      batch_size=args.batch_size, kl_thresh=args.kl)
        eval_reward = eval_policy(algo.actor, env, False, min_timesteps=args.traj_len * 5,
                                  max_traj_len=args.traj_len, verbose=False)

        timesteps += steps
        print("iter {:4d} | return: {:5.2f} | KL {:5.4f} | timesteps {:n}".format(itr, eval_reward, kl, timesteps))

        if best_reward is None or eval_reward > best_reward:
            print("\t(best policy so far! saving to {})".format(args.save_actor))
            best_reward = eval_reward
            if args.save_actor is not None:
                torch.save(algo.actor, args.save_actor)
            if args.save_critic is not None:
                torch.save(algo.critic, args.save_critic)

        if logger is not None:
            logger.add_scalar(args.env_name + '/kl', kl, timesteps)
            logger.add_scalar(args.env_name + '/return', eval_reward, timesteps)
            logger.add_scalar(args.env_name + '/actor loss', a_loss, timesteps)
            logger.add_scalar(args.env_name + '/critic loss', c_loss, timesteps)

        itr += 1
    print("Finished ({} of {}).".format(timesteps, args.timesteps))
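# --- Illustrative sketch (not part of the original file) ----------------------
# The "save on best evaluation return" pattern used in the loop above, factored
# out for clarity. `save_path` is a placeholder name.
def _checkpoint_if_best_sketch(actor, eval_reward, best_reward, save_path):
    if best_reward is None or eval_reward > best_reward:
        torch.save(actor, save_path)
        return eval_reward  # new best
    return best_reward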
def run_experiment(args):
    torch.set_num_threads(1)

    from util.env import env_factory
    from util.log import create_logger

    # Start ray
    ray.init(num_gpus=0, include_webui=True, redis_address=args.redis_address)

    # Set seeds
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    # wrapper function for creating parallelized envs
    env_fn = env_factory(args.env_name, traj=args.traj, state_est=args.state_est,
                         dynamics_randomization=args.dyn_random, mirror=args.mirror,
                         clock_based=args.clock_based, reward=args.reward, history=args.history)
    max_traj_len = args.max_traj_len

    obs_dim = env_fn().observation_space.shape[0]
    action_dim = env_fn().action_space.shape[0]
    max_action = 1.0

    # Create replay buffer and remote logger
    memory_id = ReplayBuffer_remote.remote(args.replay_size, args.policy_name, args)
    logger_id = TD3_logger.remote(args)

    # Create remote learner (learner will create the evaluators) and replay buffer
    learner_id = Learner.remote(env_fn, memory_id, logger_id, args.max_timesteps, obs_dim, action_dim,
                                args.a_lr, args.c_lr, batch_size=args.batch_size, discount=args.discount,
                                update_freq=args.update_freq, evaluate_freq=args.evaluate_freq,
                                render_policy=args.render_policy, hidden_size=args.hidden_size,
                                env_name=args.env_name, policy_name=args.policy_name)

    # Create remote actors
    num_actors = args.num_procs - 3  # subtract replay buffer actor, learner actor, and logger actor
    actors_ids = [Actor.remote(env_fn, learner_id, memory_id, logger_id, action_dim,
                               args.start_timesteps // num_actors, args.initial_load_freq,
                               args.taper_load_freq, args.act_noise, args.noise_scale,
                               args.param_noise, i, hidden_size=args.hidden_size,
                               viz_actor=args.viz_actors, env_name=args.env_name)
                  for i in range(num_actors)]

    print()
    print("Asynchronous Twin-Delayed Deep Deterministic policy gradients:")
    print("\tenv: {}".format(args.env_name))
    print("\tmax traj len: {}".format(args.max_traj_len))
    print("\tseed: {}".format(args.seed))
    print("\tmirror: {}".format(args.mirror))
    print("\tnum procs: {}".format(args.num_procs))
    print("\ta_lr: {}".format(args.a_lr))
    print("\tc_lr: {}".format(args.c_lr))
    print("\ttau: {}".format(args.tau))
    print("\tgamma: {}".format(args.discount))
    print("\tact noise: {}".format(args.act_noise))
    print("\tparam noise: {}".format(args.param_noise))
    if args.param_noise:
        print("\tnoise scale: {}".format(args.noise_scale))
    print("\tbatch size: {}".format(args.batch_size))
    # print("\tpolicy noise: {}".format(args.policy_noise))
    # print("\tnoise clip: {}".format(args.noise_clip))
    # print("\tpolicy freq: {}".format(args.policy_freq))
    print("\tload freq: {}".format(args.initial_load_freq))
    print("\ttaper load freq: {}".format(args.taper_load_freq))
    print()

    start = time.time()

    # start collection loop for each actor
    futures = [actor_id.collect_experience.remote() for actor_id in actors_ids]

    # start training loop for learner
    # while True:
    #     learner_id.update_model.remote()

    # TODO: make evaluator its own ray object with separate loop
    # futures.append(evaluator_id...)

    # wait for training to complete (THIS DOESN'T WORK AND I DON'T KNOW WHY)
    # ray.wait(futures, num_returns=len(futures))
    ray.get(futures)
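# --- Illustrative sketch (not part of the original file) ----------------------
# A toy version of the fan-out/fan-in Ray pattern used above: several remote
# tasks run concurrently while the driver blocks on their futures with
# ray.get(). This stands in for the repo's Actor.collect_experience.remote().
@ray.remote
def _toy_worker_sketch(seed):
    return seed * 2

def _fan_out_sketch(n_workers=4):
    futures = [_toy_worker_sketch.remote(i) for i in range(n_workers)]
    return ray.get(futures)  # blocks until every worker returns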
def evaluate(actor, obs_qbn=None, hid_qbn=None, cel_qbn=None, act_qbn=None,
             episodes=5, max_traj_len=1000):
    """
    A function used for evaluating the performance of a policy network when any,
    some, or none of the QBNs are inserted into key points of its execution.
    """
    with torch.no_grad():  # no gradients necessary
        env = env_factory(actor.env_name)()
        reward = 0

        layertype = actor.layers[0].__class__.__name__

        obs_states = {}
        hid_states = {}
        cel_states = {}
        act_states = {}
        for _ in range(episodes):
            done = False
            traj_len = 0

            if hasattr(actor, 'init_hidden_state'):
                actor.init_hidden_state()

            state = torch.as_tensor(env.reset())
            while not done and traj_len < max_traj_len:
                state = torch.as_tensor(state).float()
                hidden_state = actor.hidden[0]
                if layertype == 'LSTMCell':
                    cell_state = actor.cells[0]

                if obs_qbn is not None:
                    state = obs_qbn(state)  # discretize state
                    disc_state = np.round(obs_qbn.encode(state).numpy()).astype(int)
                    key = ''.join(map(str, disc_state))
                    if key not in obs_states:
                        obs_states[key] = True

                if hid_qbn is not None:  # handle the hidden state QBN
                    actor.hidden = [hid_qbn(hidden_state)]  # discretize hidden state
                    disc_state = np.round(hid_qbn.encode(hidden_state).numpy()).astype(int)
                    key = ''.join(map(str, disc_state[0]))
                    if key not in hid_states:
                        hid_states[key] = True

                if cel_qbn is not None:  # handle the cell state QBN
                    actor.cells = [cel_qbn(cell_state)]  # discretize cell state if using LSTM
                    disc_state = np.round(cel_qbn.encode(cell_state).numpy()).astype(int)
                    key = ''.join(map(str, disc_state[0]))
                    if key not in cel_states:
                        cel_states[key] = True

                action = actor(state)  # run the policy to get an action

                if act_qbn is not None:  # handle the action QBN
                    action = act_qbn(action)  # discretize action
                    disc_state = np.round(act_qbn.encode(action).numpy()).astype(int)
                    key = ''.join(map(str, disc_state))
                    if key not in act_states:
                        act_states[key] = True

                state, r, done, _ = env.step(action.numpy())
                traj_len += 1
                reward += r / episodes

        if layertype == 'LSTMCell':
            # return cell states in addition to others if using LSTM
            return reward, len(obs_states), len(hid_states), len(cel_states), len(act_states)
        else:
            return reward, len(obs_states), len(hid_states), None, len(act_states)
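# --- Illustrative sketch (not part of the original file) ----------------------
# The discrete-state counting trick used in evaluate() above, in isolation:
# round the bottleneck encoding to integers and use its string form as a set
# key. `qbn` is assumed to expose encode() returning a tanh-bounded tensor.
def _count_discrete_states_sketch(qbn, tensors):
    seen = {}
    for x in tensors:
        code = np.round(qbn.encode(x).numpy()).astype(int)
        seen[''.join(map(str, code.flatten()))] = True
    return len(seen)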
def run_experiment(args):
    """
    The entry point for the QBN insertion algorithm. This function is called by
    r2l.py, and is passed an args dictionary which contains hyperparameters for
    running the experiment.
    """
    locale.setlocale(locale.LC_ALL, '')

    from util.env import env_factory
    from util.log import create_logger

    if args.policy is None:
        print("You must provide a policy with --policy.")
        exit(1)

    policy = torch.load(args.policy)  # load policy to be discretized

    layertype = policy.layers[0].__class__.__name__
    if layertype not in ('LSTMCell', 'GRUCell'):
        # ensure that the policy loaded is actually recurrent
        print("Cannot do QBN insertion on a non-recurrent policy.")
        raise NotImplementedError

    if len(policy.layers) > 1:
        # ensure that the policy only has one hidden layer
        print("Cannot do QBN insertion on a policy with more than one hidden layer.")
        raise NotImplementedError

    # retrieve dimensions of relevant quantities
    env_fn = env_factory(policy.env_name)
    obs_dim = env_fn().observation_space.shape[0]
    action_dim = env_fn().action_space.shape[0]
    hidden_dim = policy.layers[0].hidden_size

    # parse QBN layer sizes from command line arg
    layers = [int(x) for x in args.layers.split(',')]

    # create QBNs
    obs_qbn = QBN(obs_dim, layers=layers)
    hidden_qbn = QBN(hidden_dim, layers=layers)
    action_qbn = QBN(action_dim, layers=layers)
    if layertype == 'LSTMCell':
        cell_qbn = QBN(hidden_dim, layers=layers)
    else:
        cell_qbn = None

    # create optimizers for all QBNs
    obs_optim = optim.Adam(obs_qbn.parameters(), lr=args.lr, eps=1e-6)
    hidden_optim = optim.Adam(hidden_qbn.parameters(), lr=args.lr, eps=1e-6)
    action_optim = optim.Adam(action_qbn.parameters(), lr=args.lr, eps=1e-6)
    if layertype == 'LSTMCell':
        cell_optim = optim.Adam(cell_qbn.parameters(), lr=args.lr, eps=1e-6)

    best_reward = None

    if not args.nolog:
        logger = create_logger(args)
    else:
        logger = None

    actor_dir = os.path.split(args.policy)[0]

    ray.init()

    # evaluate policy without QBNs inserted to get baseline reward
    n_reward, _, _, _, _ = evaluate(policy, episodes=20)
    if logger is not None:
        logger.add_scalar(policy.env_name + '_qbn/nominal_reward', n_reward, 0)

    # if generated data already exists at this directory, then just load that
    if os.path.exists(os.path.join(actor_dir, 'train_states.pt')):
        train_states  = torch.load(os.path.join(actor_dir, 'train_states.pt'))
        train_actions = torch.load(os.path.join(actor_dir, 'train_actions.pt'))
        train_hiddens = torch.load(os.path.join(actor_dir, 'train_hiddens.pt'))
        test_states   = torch.load(os.path.join(actor_dir, 'test_states.pt'))
        test_actions  = torch.load(os.path.join(actor_dir, 'test_actions.pt'))
        test_hiddens  = torch.load(os.path.join(actor_dir, 'test_hiddens.pt'))
        if layertype == 'LSTMCell':
            train_cells = torch.load(os.path.join(actor_dir, 'train_cells.pt'))
            test_cells  = torch.load(os.path.join(actor_dir, 'test_cells.pt'))
    else:
        # if no data exists, generate some
        start = time.time()
        data = ray.get([
            collect_data.remote(policy, args.dataset / args.workers, 400, np.random.randint(65535))
            for _ in range(args.workers)
        ])
        states  = torch.from_numpy(np.vstack([r[0] for r in data]))
        actions = torch.from_numpy(np.vstack([r[1] for r in data]))
        hiddens = torch.from_numpy(np.vstack([r[2] for r in data]))
        if layertype == 'LSTMCell':
            cells = torch.from_numpy(np.vstack([r[3] for r in data]))

        split = int(0.8 * len(states))  # 80/20 train/test split
        train_states,  test_states  = states[:split],  states[split:]
        train_actions, test_actions = actions[:split], actions[split:]
        train_hiddens, test_hiddens = hiddens[:split], hiddens[split:]
        if layertype == 'LSTMCell':
            train_cells, test_cells = cells[:split], cells[split:]

        print("{:3.2f}s to collect {} timesteps. Training set is {}, test set is {}".format(
            time.time() - start, len(states), len(train_states), len(test_states)))

        torch.save(train_states,  os.path.join(actor_dir, 'train_states.pt'))
        torch.save(train_actions, os.path.join(actor_dir, 'train_actions.pt'))
        torch.save(train_hiddens, os.path.join(actor_dir, 'train_hiddens.pt'))
        if layertype == 'LSTMCell':
            torch.save(train_cells, os.path.join(actor_dir, 'train_cells.pt'))
        torch.save(test_states,  os.path.join(actor_dir, 'test_states.pt'))
        torch.save(test_actions, os.path.join(actor_dir, 'test_actions.pt'))
        torch.save(test_hiddens, os.path.join(actor_dir, 'test_hiddens.pt'))
        if layertype == 'LSTMCell':
            torch.save(test_cells, os.path.join(actor_dir, 'test_cells.pt'))

    # run the nominal QBN training algorithm via unsupervised learning on the dataset
    for epoch in range(args.epochs):
        random_indices = SubsetRandomSampler(range(train_states.shape[0]))
        sampler = BatchSampler(random_indices, args.batch_size, drop_last=False)

        epoch_obs_losses = []
        epoch_hid_losses = []
        epoch_act_losses = []
        epoch_cel_losses = []
        for i, batch in enumerate(sampler):
            # get batch inputs from dataset
            batch_states  = train_states[batch]
            batch_actions = train_actions[batch]
            batch_hiddens = train_hiddens[batch]
            if layertype == 'LSTMCell':
                batch_cells = train_cells[batch]

            # do forward pass to create derivative graph
            obs_loss = 0.5 * (batch_states  - obs_qbn(batch_states)).pow(2).mean()
            hid_loss = 0.5 * (batch_hiddens - hidden_qbn(batch_hiddens)).pow(2).mean()
            act_loss = 0.5 * (batch_actions - action_qbn(batch_actions)).pow(2).mean()
            if layertype == 'LSTMCell':
                cel_loss = 0.5 * (batch_cells - cell_qbn(batch_cells)).pow(2).mean()

            # gradient calculation and parameter updates
            obs_optim.zero_grad()
            obs_loss.backward()
            obs_optim.step()

            hidden_optim.zero_grad()
            hid_loss.backward()
            hidden_optim.step()

            action_optim.zero_grad()
            act_loss.backward()
            action_optim.step()

            if layertype == 'LSTMCell':
                cell_optim.zero_grad()
                cel_loss.backward()
                cell_optim.step()

            epoch_obs_losses.append(obs_loss.item())
            epoch_hid_losses.append(hid_loss.item())
            epoch_act_losses.append(act_loss.item())
            if layertype == 'LSTMCell':
                epoch_cel_losses.append(cel_loss.item())
            print("epoch {:3d} / {:3d}, batch {:3d} / {:3d}".format(
                epoch + 1, args.epochs, i + 1, len(sampler)), end='\r')

        epoch_obs_losses = np.mean(epoch_obs_losses)
        epoch_hid_losses = np.mean(epoch_hid_losses)
        epoch_act_losses = np.mean(epoch_act_losses)
        if layertype == 'LSTMCell':
            epoch_cel_losses = np.mean(epoch_cel_losses)

        # collect some statistics about performance on the test set
        with torch.no_grad():
            state_loss  = 0.5 * (test_states  - obs_qbn(test_states)).pow(2).mean()
            hidden_loss = 0.5 * (test_hiddens - hidden_qbn(test_hiddens)).pow(2).mean()
            act_loss    = 0.5 * (test_actions - action_qbn(test_actions)).pow(2).mean()
            if layertype == 'LSTMCell':
                cell_loss = 0.5 * (test_cells - cell_qbn(test_cells)).pow(2).mean()

        # evaluate QBN performance one-by-one
        print("\nEvaluating...")
        d_reward, s_states, h_states, c_states, a_states = evaluate(
            policy, obs_qbn=obs_qbn, hid_qbn=hidden_qbn, cel_qbn=cell_qbn, act_qbn=action_qbn)
        c_reward = 0.0
        if layertype == 'LSTMCell':
            c_reward, _, _, _, _ = evaluate(policy, obs_qbn=None, hid_qbn=None, cel_qbn=cell_qbn, act_qbn=None)
        h_reward, _, _, _, _ = evaluate(policy, obs_qbn=None, hid_qbn=hidden_qbn, cel_qbn=None, act_qbn=None)
        s_reward, _, _, _, _ = evaluate(policy, obs_qbn=obs_qbn, hid_qbn=None, cel_qbn=None, act_qbn=None)
        a_reward, _, _, _, _ = evaluate(policy, obs_qbn=None, hid_qbn=None, cel_qbn=None, act_qbn=action_qbn)

        if best_reward is None or d_reward > best_reward:
            best_reward = d_reward
            torch.save(obs_qbn, os.path.join(logger.dir, 'obsqbn.pt'))
            torch.save(hidden_qbn, os.path.join(logger.dir, 'hidqbn.pt'))
            if layertype == 'LSTMCell':
                torch.save(cell_qbn, os.path.join(logger.dir, 'celqbn.pt'))

        if layertype == 'LSTMCell':
            print("Losses: {:7.5f} {:7.5f} {:7.5f}".format(state_loss, hidden_loss, cell_loss))
            print("States: {:5d} {:5d} {:5d}".format(s_states, h_states, c_states))
            print("QBN reward: {:5.1f} ({:5.1f}, {:5.1f}, {:5.1f}, {:5.1f}) | Nominal reward {:5.0f}".format(
                d_reward, h_reward, s_reward, c_reward, a_reward, n_reward))
        else:
            print("Losses: {:7.5f} {:7.5f}".format(state_loss, hidden_loss))
            print("States: {:5d} {:5d}".format(s_states, h_states))
            print("QBN reward: {:5.1f} ({:5.1f}, {:5.1f}, {:5.1f}) | Nominal reward {:5.0f}".format(
                d_reward, h_reward, s_reward, a_reward, n_reward))

        if logger is not None:
            logger.add_scalar(policy.env_name + '_qbn/obs_loss', state_loss, epoch)
            logger.add_scalar(policy.env_name + '_qbn/hidden_loss', hidden_loss, epoch)
            logger.add_scalar(policy.env_name + '_qbn/qbn_reward', d_reward, epoch)
            if layertype == 'LSTMCell':
                logger.add_scalar(policy.env_name + '_qbn/cell_loss', cell_loss, epoch)
                logger.add_scalar(policy.env_name + '_qbn/cellonly_reward', c_reward, epoch)
                logger.add_scalar(policy.env_name + '_qbn/cell_states', c_states, epoch)
            logger.add_scalar(policy.env_name + '_qbn/obsonly_reward', s_reward, epoch)
            logger.add_scalar(policy.env_name + '_qbn/hiddenonly_reward', h_reward, epoch)
            logger.add_scalar(policy.env_name + '_qbn/actiononly_reward', a_reward, epoch)
            logger.add_scalar(policy.env_name + '_qbn/observation_states', s_states, epoch)
            logger.add_scalar(policy.env_name + '_qbn/hidden_states', h_states, epoch)
            logger.add_scalar(policy.env_name + '_qbn/action_states', a_states, epoch)

    print("Training phase over. Beginning finetuning.")

    # initialize new optimizers, since the gradient magnitudes will likely change
    # as we are calculating a different quantity.
    obs_optim = optim.Adam(obs_qbn.parameters(), lr=args.lr, eps=1e-6)
    hidden_optim = optim.Adam(hidden_qbn.parameters(), lr=args.lr, eps=1e-6)
    if layertype == 'LSTMCell':
        cell_optim = optim.Adam(cell_qbn.parameters(), lr=args.lr, eps=1e-6)
        optims = [obs_optim, hidden_optim, cell_optim, action_optim]
    else:
        optims = [obs_optim, hidden_optim, action_optim]
    # NOTE: the original had an unconditional `optims = [action_optim]` here,
    # which made the list above dead code; it has been removed as a leftover.

    # run the finetuning portion of the QBN algorithm.
    for fine_iter in range(args.iterations):
        losses = []
        for ep in range(args.episodes):
            env = env_fn()
            state = torch.as_tensor(env.reset())

            done = False
            traj_len = 0
            if hasattr(policy, 'init_hidden_state'):
                policy.init_hidden_state()

            reward = 0
            while not done and traj_len < args.traj_len:
                with torch.no_grad():
                    state = torch.as_tensor(state).float()
                    hidden = policy.hidden[0]
                    #policy.hidden = [hidden_qbn(hidden)]
                    if layertype == 'LSTMCell':
                        cell = policy.cells[0]
                        #policy.cells = [cell_qbn(cell)]

                # Compute qbn values
                qbn_action = action_qbn(policy(obs_qbn(state)))

                with torch.no_grad():
                    policy.hidden = [hidden]
                    if layertype == 'LSTMCell':
                        policy.cells = [cell]
                    action = policy(state)

                    state, r, done, _ = env.step(action.numpy())
                    reward += r
                    traj_len += 1

                # this creates the derivative graph for our backwards pass
                step_loss = 0.5 * (action - qbn_action).pow(2)
                losses += [step_loss]

        # clear our parameter gradients
        for opt in optims:
            opt.zero_grad()

        # run the backwards pass
        losses = torch.stack(losses).mean()
        losses.backward()

        # update parameters
        for opt in optims:
            opt.step()

        # evaluate QBN performance one-by-one
        print("\nEvaluating...")
        d_reward, s_states, h_states, c_states, a_states = evaluate(
            policy, obs_qbn=obs_qbn, hid_qbn=hidden_qbn, cel_qbn=cell_qbn, act_qbn=action_qbn)
        c_reward = 0.0
        if layertype == 'LSTMCell':
            c_reward, _, _, _, _ = evaluate(policy, obs_qbn=None, hid_qbn=None, cel_qbn=cell_qbn, act_qbn=None)
        h_reward, _, _, _, _ = evaluate(policy, obs_qbn=None, hid_qbn=hidden_qbn, cel_qbn=None, act_qbn=None)
        s_reward, _, _, _, _ = evaluate(policy, obs_qbn=obs_qbn, hid_qbn=None, cel_qbn=None, act_qbn=None)
        a_reward, _, _, _, _ = evaluate(policy, obs_qbn=None, hid_qbn=None, cel_qbn=None, act_qbn=action_qbn)

        if layertype == 'LSTMCell':
            print("Finetuning loss: {:7.5f}".format(losses.item()))
            print("States: {:5d} {:5d} {:5d}".format(s_states, h_states, c_states))
            print("QBN reward: {:5.1f} ({:5.1f}, {:5.1f}, {:5.1f}, {:5.1f}) | Nominal reward {:5.0f}".format(
                d_reward, h_reward, s_reward, c_reward, a_reward, n_reward))
        else:
            print("Finetuning loss: {:7.5f}".format(losses.item()))
            print("States: {:5d} {:5d}".format(s_states, h_states))
            print("QBN reward: {:5.1f} ({:5.1f}, {:5.1f}, {:5.1f}) | Nominal reward {:5.0f}".format(
                d_reward, h_reward, s_reward, a_reward, n_reward))

        if logger is not None:
            # offset by the last training-phase epoch so curves continue on the same axis
            logger.add_scalar(policy.env_name + '_qbn/finetune_loss', losses.item(), epoch + fine_iter)
            logger.add_scalar(policy.env_name + '_qbn/qbn_reward', d_reward, epoch + fine_iter)
            if layertype == 'LSTMCell':
                logger.add_scalar(policy.env_name + '_qbn/cellonly_reward', c_reward, epoch + fine_iter)
                logger.add_scalar(policy.env_name + '_qbn/cell_states', c_states, epoch + fine_iter)
            logger.add_scalar(policy.env_name + '_qbn/obsonly_reward', s_reward, epoch + fine_iter)
            logger.add_scalar(policy.env_name + '_qbn/hiddenonly_reward', h_reward, epoch + fine_iter)
            logger.add_scalar(policy.env_name + '_qbn/actiononly_reward', a_reward, epoch + fine_iter)
            logger.add_scalar(policy.env_name + '_qbn/observation_states', s_states, epoch + fine_iter)
            logger.add_scalar(policy.env_name + '_qbn/hidden_states', h_states, epoch + fine_iter)
            logger.add_scalar(policy.env_name + '_qbn/action_states', a_states, epoch + fine_iter)

        if best_reward is None or d_reward > best_reward:
            best_reward = d_reward
            torch.save(obs_qbn, os.path.join(logger.dir, 'obsqbn.pt'))
            torch.save(hidden_qbn, os.path.join(logger.dir, 'hidqbn.pt'))
            if layertype == 'LSTMCell':
                torch.save(cell_qbn, os.path.join(logger.dir, 'celqbn.pt'))
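# --- Illustrative sketch (not part of the original file) ----------------------
# For reference, a self-contained toy stand-in for the QBN objective used in
# both phases above: phase one minimizes 0.5 * ||x - QBN(x)||^2 on cached
# tensors, and finetuning matches the policy's action with QBNs inserted
# against its action without them. The bottleneck width is a placeholder.
import torch.nn as nn

class _ToyQBN(nn.Module):
    def __init__(self, dim, bottleneck=8):
        super().__init__()
        self.enc = nn.Sequential(nn.Linear(dim, bottleneck), nn.Tanh())
        self.dec = nn.Linear(bottleneck, dim)

    def encode(self, x):
        # tanh-bounded code; rounding it yields the discrete state used above
        return self.enc(x)

    def forward(self, x):
        return self.dec(self.enc(x))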
def run_experiment(args):
    from policies.critic import FF_Q, LSTM_Q, GRU_Q
    from policies.actor import FF_Stochastic_Actor, LSTM_Stochastic_Actor, GRU_Stochastic_Actor, \
                                FF_Actor, LSTM_Actor, GRU_Actor

    locale.setlocale(locale.LC_ALL, '')

    # wrapper function for creating parallelized envs
    env = env_factory(args.env)()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if hasattr(env, 'seed'):
        env.seed(args.seed)

    obs_space = env.observation_space.shape[0]
    act_space = env.action_space.shape[0]

    replay_buff = ReplayBuffer(args.buffer)

    layers = [int(x) for x in args.layers.split(',')]

    if args.arch == 'lstm':
        q1 = LSTM_Q(obs_space, act_space, env_name=args.env, layers=layers)
        q2 = LSTM_Q(obs_space, act_space, env_name=args.env, layers=layers)
        if args.algo == 'sac':
            actor = LSTM_Stochastic_Actor(obs_space, act_space, env_name=args.env, bounded=True, layers=layers)
        else:
            actor = LSTM_Actor(obs_space, act_space, env_name=args.env, layers=layers)
    elif args.arch == 'gru':
        q1 = GRU_Q(obs_space, act_space, env_name=args.env, layers=layers)
        q2 = GRU_Q(obs_space, act_space, env_name=args.env, layers=layers)
        if args.algo == 'sac':
            actor = GRU_Stochastic_Actor(obs_space, act_space, env_name=args.env, bounded=True, layers=layers)
        else:
            actor = GRU_Actor(obs_space, act_space, env_name=args.env, layers=layers)
    elif args.arch == 'ff':
        q1 = FF_Q(obs_space, act_space, env_name=args.env, layers=layers)
        q2 = FF_Q(obs_space, act_space, env_name=args.env, layers=layers)
        if args.algo == 'sac':
            actor = FF_Stochastic_Actor(obs_space, act_space, env_name=args.env, bounded=True, layers=layers)
        else:
            actor = FF_Actor(obs_space, act_space, env_name=args.env, layers=layers)
    else:
        raise RuntimeError("Invalid architecture '{}'; expected 'lstm', 'gru', or 'ff'.".format(args.arch))

    if args.algo == 'sac':
        print('Soft Actor-Critic')
        algo = SAC(actor, q1, q2, torch.prod(torch.Tensor(env.reset().shape)), args)
    elif args.algo == 'td3':
        print('Twin-Delayed Deep Deterministic Policy Gradient')
        algo = TD3(actor, q1, q2, args)
    elif args.algo == 'ddpg':
        print('Deep Deterministic Policy Gradient')
        algo = DDPG(actor, q1, args)
    else:
        raise RuntimeError("Invalid algorithm '{}'; expected 'sac', 'td3', or 'ddpg'.".format(args.algo))

    print("\tenv: {}".format(args.env))
    print("\tseed: {}".format(args.seed))
    print("\ttimesteps: {:n}".format(args.timesteps))
    print("\tactor_lr: {}".format(args.a_lr))
    print("\tcritic_lr: {}".format(args.c_lr))
    print("\tdiscount: {}".format(args.discount))
    print("\ttau: {}".format(args.tau))
    print("\tbatch_size: {}".format(args.batch_size))
    print("\twarmup period: {:n}".format(args.start_timesteps))
    print("\tworkers: {}".format(args.workers))
    print("\tlayers: {}".format(args.layers))
    print()

    # create a tensorboard logging object
    logger = create_logger(args)

    if args.save_actor is None:
        args.save_actor = os.path.join(logger.dir, 'actor.pt')

    # Keep track of some statistics for each episode
    training_start = time.time()
    episode_start = time.time()
    best_reward = None

    if not ray.is_initialized():
        #if args.redis is not None:
        #    ray.init(redis_address=args.redis)
        #else:
        #    ray.init(num_cpus=args.workers)
        ray.init(num_cpus=args.workers)

    workers = [Off_Policy_Worker.remote(actor, env_factory(args.env)) for _ in range(args.workers)]

    train_normalizer(algo.actor, args.prenormalize_steps, noise=algo.expl_noise)

    # Fill replay buffer, update policy until n timesteps have passed
    timesteps, i = 0, 0
    state = env.reset().astype(np.float32)
    while i < args.iterations:
        if timesteps < args.timesteps:
            actor_param_id = ray.put(list(algo.actor.parameters()))
            norm_id = ray.put([algo.actor.welford_state_mean,
                               algo.actor.welford_state_mean_diff,
                               algo.actor.welford_state_n])

            for w in workers:
                w.sync_policy.remote(actor_param_id, input_norm=norm_id)

            buffers = ray.get([w.collect_episode.remote(args.expl_noise, args.traj_len) for w in workers])
            replay_buff.merge_with(buffers)
            timesteps += sum(len(b.states) for b in buffers)

        #for i in range(len(replay_buff.traj_idx)-1):
        #    for j in range(replay_buff.traj_idx[i], replay_buff.traj_idx[i+1]):
        #        print("traj {:2d} timestep {:3d}, not done {}, reward {},".format(i, j, replay_buff.not_dones[j], replay_buff.rewards[j]))

        if (algo.recurrent and len(replay_buff.traj_idx) > args.batch_size) or \
           (not algo.recurrent and replay_buff.size > args.batch_size):
            i += 1
            loss = []
            for _ in range(args.updates):
                loss.append(algo.update_policy(replay_buff, batch_size=args.batch_size))
            loss = np.mean(loss, axis=0)

            print('algo {:4s} | explored: {:5n} of {:5n}'.format(args.algo, timesteps, args.timesteps), end=' | ')
            if args.algo in ('ddpg', 'td3'):
                print('iteration {:6n} | actor loss {:6.4f} | critic loss {:6.4f} | '
                      'buffer size {:6n} / {:6n} ({:4n} trajectories) | {:60s}'.format(
                          i + 1, loss[0], loss[1], replay_buff.size, replay_buff.max_size,
                          len(replay_buff.traj_idx), ''))
                logger.add_scalar(args.env + '/actor loss', loss[0], i)
                logger.add_scalar(args.env + '/critic loss', loss[1], i)

            if i % args.eval_every == 0 and i != 0:
                eval_reward = eval_policy(algo.actor, env, 5, args.traj_len)
                logger.add_scalar(args.env + '/return', eval_reward, i)

                print("evaluation after {:4d} iterations | return: {:7.3f}".format(i, eval_reward))
                if best_reward is None or eval_reward > best_reward:
                    torch.save(algo.actor, args.save_actor)
                    best_reward = eval_reward
                    print("\t(best policy so far! saving to {})".format(args.save_actor))
        else:
            if algo.recurrent:
                print("Collected {:5d} of {:5d} warmup trajectories \t\t".format(
                    len(replay_buff.traj_idx), args.batch_size), end='\r')
            else:
                print("Collected {:5d} of {:5d} warmup timesteps \t\t".format(
                    replay_buff.size, args.batch_size), end='\r')
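# --- Illustrative sketch (not part of the original file) ----------------------
# The buffer-readiness rule used above, in isolation: recurrent algorithms
# sample whole trajectories, so readiness is counted in trajectories rather
# than individual transitions.
def _buffer_ready_sketch(recurrent, n_trajectories, n_transitions, batch_size):
    return n_trajectories > batch_size if recurrent else n_transitions > batch_size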
help="Size of increments for the sensityivity sweep") parser.add_argument("--hi_factor", type=float, default=15, help="High factor") parser.add_argument("--lo_factor", type=float, default=0, help="Low factor") args = parser.parse_args() run_args = pickle.load(open(os.path.join(args.path, "experiment.pkl"), "rb")) # cassie_env = CassieEnv(traj=run_args.traj, clock_based=run_args.clock_based, state_est=run_args.state_est, dynamics_randomization=run_args.dyn_random) # env_fn = partial(CassieEnv, traj=run_args.traj, clock_based=run_args.clock_based, state_est=run_args.state_est, dynamics_randomization=run_args.dyn_random) # Make mirror False so that env_factory returns a regular wrap env function and not a symmetric env function that can be called to return # a cassie environment (symmetric env cannot be called to make another env) env_fn = env = env_factory(run_args.env_name, command_profile=run_args.command_profile, input_profile=run_args.input_profile, simrate=run_args.simrate, dynamics_randomization=run_args.dyn_random, mirror=run_args.mirror, learn_gains=run_args.learn_gains, reward=run_args.reward, history=run_args.history, no_delta=run_args.no_delta, traj=run_args.traj, ik_baseline=run_args.ik_baseline) cassie_env = env_fn() policy = torch.load(os.path.join(args.path, "actor.pt")) if args.eval: policy.eval() if hasattr(policy, 'init_hidden_state'): policy.init_hidden_state() # TODO: make returning/save data in file inside function consist for all testing functions def test_commands(cassie_env, policy, args):
def run_udp(policy_files):
    from util.env import env_factory

    policies = [torch.load(p) for p in policy_files]
    m_policies = [torch.load(p) for p in policy_files]
    legacies = ['legacy' if not (hasattr(p, 'legacy') and p.legacy == False) else '' for p in policies]
    envs = [env_factory(p.env_name + legacy)() for p, legacy in zip(policies, legacies)]

    for i, env in enumerate(envs):
        if not env.state_est:
            print("{} was not trained with state estimation and cannot be run on the robot.".format(policy_files[i]))
            raise RuntimeError

    for i, p in enumerate(policies):
        print("Policy {:3d} is a: {}".format(i, p.__class__.__name__))
    time.sleep(1)

    time_log   = []  # time stamp
    input_log  = []  # network inputs
    output_log = []  # network outputs
    state_log  = []  # cassie state
    target_log = []  # PD target log

    env = envs[0]
    clock_based = env.clock
    no_delta = True

    u = pd_in_t()
    for i in range(5):
        u.leftLeg.motorPd.pGain[i]  = env.P[i]
        u.leftLeg.motorPd.dGain[i]  = env.D[i]
        u.rightLeg.motorPd.pGain[i] = env.P[i]
        u.rightLeg.motorPd.dGain[i] = env.D[i]

    if platform.node() == 'cassie':
        cassie = CassieUdp(remote_addr='10.10.10.3', remote_port='25010',
                           local_addr='10.10.10.100', local_port='25011')
    else:
        cassie = CassieUdp()  # local testing

    print('Connecting...')
    y = None
    while y is None:
        cassie.send_pd(pd_in_t())
        time.sleep(0.001)
        y = cassie.recv_newest_pd()

    policy_idx = None
    print('Connected!\n')

    # Whether or not STO has been TOGGLED (i.e. it does not count the initial STO condition)
    # STO = True means that STO is ON (i.e. robot is not running) and STO = False means that STO is
    # OFF (i.e. robot *is* running)
    ESTOP = False

    # We have multiple modes of operation
    # 0: Normal operation, walking with policy
    # 1: Zero out memory of policy (if applicable)
    # 2: Stop, drop, and hopefully not roll, damping mode with no P gain
    operation_mode = 0

    # Command inputs
    speed = 0
    side_speed = 0
    orient_add = 0
    phase_add = env.simrate
    phase = 0
    D_mult = 1
    actual_speed = 0
    delay = 30
    counter = 0
    pitch_bias = 0
    ESTOP_count = 0

    max_speed = 2.00
    min_speed = -0.30
    max_y_speed = 0.25
    min_y_speed = -0.25

    logged = True

    # EDITORIAL ASSUMPTION: `mirror` is referenced below but never assigned in
    # this fragment; since the mirrored policies are evaluated every cycle, it
    # is defined here as True so the averaging branch is active.
    mirror = True

    for policy in policies:
        if hasattr(policy, 'init_hidden_state'):
            policy.init_hidden_state()
    for policy in m_policies:
        if hasattr(policy, 'init_hidden_state'):
            policy.init_hidden_state()

    old_settings = termios.tcgetattr(sys.stdin)
    try:
        tty.setcbreak(sys.stdin.fileno())

        t = time.monotonic()
        t0 = t
        with torch.no_grad():
            while True:
                t = time.monotonic()
                tt = time.monotonic() - t0

                # Get newest state
                state = cassie.recv_newest_pd()
                if state is None:
                    print('Missed a cycle!')
                    continue

                if platform.node() == 'cassie':
                    """
                    Control of the physical robot using a wireless handheld controller.
                    """
                    # Switch the operation mode based on the toggle next to STO
                    if state.radio.channel[9] < -0.5:   # towards operator means damping shutdown mode
                        operation_mode = 2
                    elif state.radio.channel[9] > 0.5:  # away from the operator means zero hidden states
                        operation_mode = 1
                    else:                               # middle means normal walking
                        operation_mode = 0

                    # Radio control
                    orient_add -= state.radio.channel[3] / 60.0

                    # Reset orientation on STO
                    if state.radio.channel[8] < 0:
                        orient_add = quaternion2euler(state.pelvis.orientation[:])[2]
                        ESTOP = True
                    else:
                        ESTOP = False
                        logged = False

                    raw_spd = (state.radio.channel[6] + 1) / 2 + state.radio.channel[0] / 5
                    speed = raw_spd * max_speed if raw_spd > 0 else -raw_spd * min_speed

                    raw_side_spd = -state.radio.channel[1]
                    side_speed = raw_side_spd * max_y_speed if raw_side_spd > 0 else -raw_side_spd * min_y_speed

                    phase_add = env.simrate + env.simrate * (state.radio.channel[7] + 0.75) / 2
                    pitch_bias = state.radio.channel[5] / 6

                    policy_idx = int(state.radio.channel[10])
                    if policy_idx == -1:
                        policy_idx = None
                else:
                    """
                    Control of the robot in simulation using a keyboard.
                    """
                    tt = time.monotonic() - t0
                    if check_stdin():
                        c = sys.stdin.read(1)
                        if c == 'w':
                            speed += 0.1
                            phase_add = np.clip(phase_add, (0.6 * speed) * 60, None)
                        if c == 's':
                            speed -= 0.1
                        if c == 'q':
                            orient_add -= 0.1
                        if c == 'e':
                            orient_add += 0.1
                        if c == 'a':
                            side_speed -= 0.05
                        if c == 'd':
                            side_speed += 0.05
                        if c == 'r':
                            speed = 0.5
                            orient_add = 0
                            side_speed = 0
                        if c == 't':
                            phase_add += 1
                        if c == 'g':
                            phase_add -= 1
                        if c.isdigit():
                            if int(c) > len(policies) - 1:
                                policy_idx = None
                            else:
                                policy_idx = int(c)
                        if c == 'x':
                            for policy in policies:
                                policy.init_hidden_state()
                            ESTOP = not ESTOP
                            logged = False

                    side_speed = max(min_y_speed, side_speed)
                    side_speed = min(max_y_speed, side_speed)

                if ESTOP:
                    # Save log files after STO toggle (skipping first STO)
                    if not logged:
                        logged = True
                        #log(ESTOP_count)
                        data = {"time": time_log, "output": output_log, "input": input_log,
                                "state": state_log, "target": target_log}
                        fname = 'log_' + \
                                datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d_%H:%M') + \
                                '_' + str(datetime.timedelta(seconds=round(tt))) + \
                                '.pkl'
                        print()
                        print(fname)
                        print()
                        filep = open(fname, 'wb')
                        pickle.dump(data, filep)
                        filep.close()
                        ESTOP_count += 1

                        # Clear out logs
                        time_log   = []
                        input_log  = []
                        output_log = []
                        state_log  = []
                        target_log = []
                        t0 = time.monotonic()

                    for policy in policies:
                        if hasattr(policy, 'init_hidden_state'):
                            policy.init_hidden_state()

                #------------------------------- Normal Walking ---------------------------
                if operation_mode == 1:
                    for policy in policies:
                        if hasattr(policy, 'init_hidden_state'):
                            policy.init_hidden_state()

                # Rotate pelvis orientation to turn robot
                quaternion = euler2quat(z=orient_add, y=0, x=0)
                iquaternion = inverse_quaternion(quaternion)
                new_orient = quaternion_product(iquaternion, state.pelvis.orientation[:])

                clock = [np.sin(2 * np.pi * phase / len(env.trajectory)),
                         np.cos(2 * np.pi * phase / len(env.trajectory))]

                motor_pos = state.motor.position[:]
                joint_pos = state.joint.position[:]
                motor_vel = state.motor.velocity[:]
                joint_vel = state.joint.velocity[:]
                joint_pos = joint_pos[:2] + joint_pos[3:5]  # remove double-counted joint/motor positions
                joint_vel = joint_vel[:2] + joint_vel[3:5]

                if new_orient[0] < 0:
                    new_orient = [-1 * x for x in new_orient]

                if env.clock:
                    ext_state = np.concatenate((clock, [speed, side_speed]))
                else:
                    ext_state = np.concatenate(([speed], [side_speed]))

                pelvis_vel = rotate_by_quaternion(state.pelvis.translationalVelocity[:], iquaternion)
                pelvis_rvel = state.pelvis.rotationalVelocity[:]
                pelvis_hgt = state.pelvis.position[2] - state.terrain.height

                robot_state = np.concatenate([
                    new_orient,   # pelvis orientation
                    motor_pos,    # actuated joint positions
                    pelvis_vel,   # pelvis translational velocity
                    pelvis_rvel,  # pelvis rotational velocity
                    motor_vel,    # actuated joint velocities
                    joint_pos,    # unactuated joint positions
                    joint_vel     # unactuated joint velocities
                ])

                if operation_mode == 2 or ESTOP:
                    mode = 'DAMP'
                elif operation_mode == 1:
                    mode = 'WIPE'
                elif operation_mode == 0:
                    mode = 'WALK'

                actual_speed = 0.9 * actual_speed + 0.1 * pelvis_vel[0]

                RL_state = np.concatenate([robot_state, ext_state])
                mirror_RL_state = env.mirror_state(RL_state)

                # Construct input vectors
                torch_states = [torch.Tensor(RL_state) for _ in policies]
                mirror_torch_states = [torch.Tensor(mirror_RL_state) for _ in policies]

                if no_delta:
                    offset = env.offset
                else:
                    offset = env.get_ref_state(phase=phase)[0][env.pos_idx]

                actions = [policy(state).numpy()
                           for state, policy in zip(torch_states, policies)]
                mirror_actions = [env.mirror_action(policy(state).numpy())
                                  for state, policy in zip(mirror_torch_states, m_policies)]

                if mirror:
                    env_action = [(a + m) / 2 for a, m in zip(actions, mirror_actions)]
                else:
                    env_action = actions

                targets = [action[:10] + offset for action in env_action]
                p_gains = [action[10:20] if len(action) > 10 else np.zeros(10) for action in env_action]
                d_gains = [action[20:30] if len(action) > 20 else np.zeros(10) for action in env_action]

                if policy_idx is None:
                    target = np.mean(targets, axis=0)
                    p_gain = np.mean(p_gains, axis=0)
                    d_gain = np.mean(d_gains, axis=0)
                else:
                    target = targets[policy_idx]
                    p_gain = p_gains[policy_idx]
                    d_gain = d_gains[policy_idx]

                print("MODE {:10s} | IDX {} | Des. Spd. {:5.2f} | Speed {:5.1f} | Sidespeed {:4.1f} | "
                      "Heading {:5.1f} | Freq. {:3d} | Delay {:6.3f} | {:6.4f} {:20s}".format(
                          mode, policy_idx, speed, actual_speed, side_speed, orient_add,
                          int(phase_add), delay, np.max(p_gain), ''), end='\r')

                if ESTOP or operation_mode == 2:
                    for i in range(5):
                        u.leftLeg.motorPd.pGain[i]    = 0.001
                        u.leftLeg.motorPd.dGain[i]    = D_mult * env.D[i]
                        u.rightLeg.motorPd.pGain[i]   = 0.001
                        u.rightLeg.motorPd.dGain[i]   = D_mult * env.D[i]
                        u.leftLeg.motorPd.pTarget[i]  = 0.001
                        u.rightLeg.motorPd.pTarget[i] = 0.001
                else:
                    # Send action
                    for i in range(5):
                        u.leftLeg.motorPd.pGain[i]    = env.P[i] + p_gain[i]
                        u.leftLeg.motorPd.dGain[i]    = env.D[i] + d_gain[i]
                        u.rightLeg.motorPd.pGain[i]   = env.P[i] + p_gain[i + 5]
                        u.rightLeg.motorPd.dGain[i]   = env.D[i] + d_gain[i + 5]
                        u.leftLeg.motorPd.pTarget[i]  = target[i]
                        u.rightLeg.motorPd.pTarget[i] = target[i + 5]

                    time_log.append(time.time())
                    state_log.append(state)
                    input_log.append(RL_state)
                    output_log.append(env_action)
                    target_log.append(target)

                cassie.send_pd(u)

                while time.monotonic() - t < 0.03:
                    time.sleep(0.001)
                delay = (time.monotonic() - t) * 1000

                # Track phase
                phase += phase_add
                if phase >= len(env.trajectory):
                    phase = phase % len(env.trajectory) - 1
                    counter += 1
    finally:
        termios.tcsetattr(sys.stdin, termios.TCSADRAIN, old_settings)
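# --- Illustrative sketch (not part of the original file) ----------------------
# The policy-blending rule above, in isolation: with no policy selected on the
# radio dial (idx is None), targets/gains are the mean across the ensemble;
# otherwise the selected policy's outputs are used verbatim.
def _blend_sketch(values, idx=None):
    return np.mean(values, axis=0) if idx is None else values[idx]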
def run_experiment(args):
    from util.env import env_factory
    from util.log import create_logger

    # wrapper function for creating parallelized envs
    env_fn = env_factory(args.env_name, simrate=args.simrate, command_profile=args.command_profile,
                         input_profile=args.input_profile, learn_gains=args.learn_gains,
                         dynamics_randomization=args.dyn_random, reward=args.reward,
                         history=args.history, mirror=args.mirror, ik_baseline=args.ik_baseline,
                         no_delta=args.no_delta, traj=args.traj)
    obs_dim = env_fn().observation_space.shape[0]
    action_dim = env_fn().action_space.shape[0]

    # Set up Parallelism
    os.environ['OMP_NUM_THREADS'] = '1'
    if not ray.is_initialized():
        if args.redis_address is not None:
            ray.init(num_cpus=args.num_procs, redis_address=args.redis_address)
        else:
            ray.init(num_cpus=args.num_procs)

    # Set seeds
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    if args.previous is not None:
        policy = torch.load(os.path.join(args.previous, "actor.pt"))
        critic = torch.load(os.path.join(args.previous, "critic.pt"))
        # TODO: add ability to load previous hyperparameters, if this is something that we even want
        # with open(args.previous + "experiment.pkl", 'rb') as file:
        #     args = pickle.loads(file.read())
        print("loaded model from {}".format(args.previous))
    else:
        if args.recurrent:
            policy = Gaussian_LSTM_Actor(obs_dim, action_dim, fixed_std=np.exp(-2), env_name=args.env_name)
            critic = LSTM_V(obs_dim)
        else:
            if args.learn_stddev:
                policy = Gaussian_FF_Actor(obs_dim, action_dim, fixed_std=None,
                                           env_name=args.env_name, bounded=args.bounded)
            else:
                policy = Gaussian_FF_Actor(obs_dim, action_dim, fixed_std=np.exp(args.std_dev),
                                           env_name=args.env_name, bounded=args.bounded)
            critic = FF_V(obs_dim)

        with torch.no_grad():
            policy.obs_mean, policy.obs_std = map(torch.Tensor,
                get_normalization_params(iter=args.input_norm_steps, noise_std=1,
                                         policy=policy, env_fn=env_fn, procs=args.num_procs))
        critic.obs_mean = policy.obs_mean
        critic.obs_std = policy.obs_std

    policy.train()
    critic.train()

    print("obs_dim: {}, action_dim: {}".format(obs_dim, action_dim))

    # create a tensorboard logging object
    logger = create_logger(args)

    algo = PPO(args=vars(args), save_path=logger.dir)

    print()
    print("Synchronous Distributed Proximal Policy Optimization:")
    print(" ├ recurrent:      {}".format(args.recurrent))
    print(" ├ run name:       {}".format(args.run_name))
    print(" ├ max traj len:   {}".format(args.max_traj_len))
    print(" ├ seed:           {}".format(args.seed))
    print(" ├ num procs:      {}".format(args.num_procs))
    print(" ├ lr:             {}".format(args.lr))
    print(" ├ eps:            {}".format(args.eps))
    print(" ├ lam:            {}".format(args.lam))
    print(" ├ gamma:          {}".format(args.gamma))
    print(" ├ learn stddev:   {}".format(args.learn_stddev))
    print(" ├ std_dev:        {}".format(args.std_dev))
    print(" ├ entropy coeff:  {}".format(args.entropy_coeff))
    print(" ├ clip:           {}".format(args.clip))
    print(" ├ minibatch size: {}".format(args.minibatch_size))
    print(" ├ epochs:         {}".format(args.epochs))
    print(" ├ num steps:      {}".format(args.num_steps))
    print(" ├ use gae:        {}".format(args.use_gae))
    print(" ├ max grad norm:  {}".format(args.max_grad_norm))
    print(" └ max traj len:   {}".format(args.max_traj_len))
    print()

    algo.train(env_fn, policy, critic, args.n_itr, logger=logger, anneal_rate=args.anneal)
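# --- Illustrative sketch (not part of the original file) ----------------------
# What get_normalization_params (called above) is responsible for, in spirit:
# roll out in the environment and compute per-dimension observation mean/std
# used to whiten policy inputs. This toy version batches observations with
# plain numpy rather than a Welford-style online update, and assumes a
# gym-style `env.action_space.sample()` for exploration.
def _obs_stats_sketch(env, n_steps=1000):
    obs = []
    state = env.reset()
    for _ in range(n_steps):
        obs.append(state)
        state, _, done, _ = env.step(env.action_space.sample())
        if done:
            state = env.reset()
    obs = np.asarray(obs)
    return obs.mean(axis=0), obs.std(axis=0) + 1e-8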
def run_experiment(args):
    # wrapper function for creating parallelized envs
    env_thunk = env_factory(args.env_name)
    with env_thunk() as env:
        obs_space = env.observation_space.shape[0]
        act_space = env.action_space.shape[0]

    # wrapper function for creating parallelized policies
    def policy_thunk():
        from rl.policies.actor import FF_Actor, LSTM_Actor, Linear_Actor
        if args.load_model is not None:
            return torch.load(args.load_model)
        else:
            if not args.recurrent:
                policy = Linear_Actor(obs_space, act_space, hidden_size=args.hidden_size).float()
            else:
                policy = LSTM_Actor(obs_space, act_space, hidden_size=args.hidden_size).float()

            # policy parameters should be zero-initialized according to the ARS paper
            for p in policy.parameters():
                p.data = torch.zeros(p.shape)
            return policy

    # the 'black box' function that will get passed into ARS
    def eval_fn(policy, env, reward_shift, traj_len, visualize=False, normalize=False):
        if hasattr(policy, 'init_hidden_state'):
            policy.init_hidden_state()

        state = torch.tensor(env.reset()).float()
        rollout_reward = 0
        done = False

        timesteps = 0
        while not done and timesteps < traj_len:
            if normalize:
                state = policy.normalize_state(state)
            action = policy.forward(state).detach().numpy()
            state, reward, done, _ = env.step(action)
            state = torch.tensor(state).float()

            rollout_reward += reward - reward_shift
            timesteps += 1
        return rollout_reward, timesteps

    import locale
    locale.setlocale(locale.LC_ALL, '')

    print("Augmented Random Search:")
    print("\tenv: {}".format(args.env_name))
    print("\tseed: {}".format(args.seed))
    print("\ttimesteps: {:n}".format(args.timesteps))
    print("\tstd: {}".format(args.std))
    print("\tdeltas: {}".format(args.deltas))
    print("\tstep size: {}".format(args.lr))
    print("\treward shift: {}".format(args.reward_shift))
    print()

    algo = ARS(policy_thunk, env_thunk, deltas=args.deltas, step_size=args.lr,
               std=args.std, workers=args.workers, redis_addr=args.redis)

    if args.algo not in ['v1', 'v2']:
        print("Valid arguments for --algo are 'v1' and 'v2'")
        exit(1)
    elif args.algo == 'v2':
        normalize_states = True
    else:
        normalize_states = False

    def black_box(p, env):
        return eval_fn(p, env, args.reward_shift, args.traj_len, normalize=normalize_states)

    avg_reward = 0
    timesteps = 0
    i = 0

    logger = create_logger(args)

    # if args.save_model is None:
    #     args.save_model = os.path.join(logger.dir, 'actor.pt')
    args.save_model = os.path.join(logger.dir, 'actor.pt')

    env = env_thunk()
    while timesteps < args.timesteps:
        if not i % args.average_every:
            avg_reward = 0
            print()

        start = time.time()
        samples = algo.step(black_box)
        elapsed = time.time() - start

        iter_reward = 0
        for eval_rollout in range(10):
            reward, _ = eval_fn(algo.policy, env, 0, args.traj_len, normalize=normalize_states)
            iter_reward += reward / 10

        timesteps += samples
        avg_reward += iter_reward
        secs_per_sample = 1000 * elapsed / samples
        print(("iter {:4d} | "
               "ret {:6.2f} | "
               "last {:3d} iters: {:6.2f} | "
               "{:0.4f}s per 1k steps | "
               "timesteps {:10n}").format(i + 1,
                                          iter_reward,
                                          (i % args.average_every) + 1,
                                          avg_reward / ((i % args.average_every) + 1),
                                          secs_per_sample, timesteps),
              end="\r")
        i += 1

        logger.add_scalar('eval', iter_reward, timesteps)
        torch.save(algo.policy, args.save_model)
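# --- Illustrative sketch (not part of the original file) ----------------------
# A simplified version of the update that algo.step(black_box) performs
# internally, following the basic random search form from the ARS paper rather
# than this repo's exact implementation: sample Gaussian perturbations,
# evaluate +/- rollouts, and step along the reward-weighted average direction.
# The reward-std scaling used by ARS V1-t/V2 is omitted for brevity.
def _ars_update_sketch(theta, eval_fn, n_deltas=8, std=0.02, lr=0.01):
    deltas = [np.random.randn(*theta.shape) for _ in range(n_deltas)]
    step = np.zeros_like(theta)
    for d in deltas:
        r_plus  = eval_fn(theta + std * d)
        r_minus = eval_fn(theta - std * d)
        step += (r_plus - r_minus) * d
    return theta + (lr / n_deltas) * step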
def run_experiment(args):
    from policies.critic import FF_Q, LSTM_Q
    from policies.actor import FF_Stochastic_Actor, LSTM_Stochastic_Actor, FF_Actor, LSTM_Actor

    locale.setlocale(locale.LC_ALL, '')

    # create a single environment instance
    env = env_factory(args.env_name)()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if hasattr(env, 'seed'):
        env.seed(args.seed)

    obs_space = env.observation_space.shape[0]
    act_space = env.action_space.shape[0]

    replay_buff = ReplayBuffer(obs_space, act_space, args.timesteps)

    if args.recurrent:
        print('Recurrent ', end='')
        q1 = LSTM_Q(obs_space, act_space, env_name=args.env_name)
        q2 = LSTM_Q(obs_space, act_space, env_name=args.env_name)
        if args.algo == 'sac':
            actor = LSTM_Stochastic_Actor(obs_space, act_space, env_name=args.env_name, bounded=True)
        else:
            actor = LSTM_Actor(obs_space, act_space, env_name=args.env_name)
    else:
        q1 = FF_Q(obs_space, act_space, env_name=args.env_name)
        q2 = FF_Q(obs_space, act_space, env_name=args.env_name)
        if args.algo == 'sac':
            actor = FF_Stochastic_Actor(obs_space, act_space, env_name=args.env_name, bounded=True)
        else:
            actor = FF_Actor(obs_space, act_space, env_name=args.env_name)

    if args.algo == 'sac':
        print('Soft Actor-Critic')
        algo = SAC(actor, q1, q2, torch.prod(torch.Tensor(env.reset().shape)), args)
    elif args.algo == 'td3':
        print('Twin-Delayed Deep Deterministic Policy Gradient')
        algo = TD3(actor, q1, q2, args)
    elif args.algo == 'ddpg':
        print('Deep Deterministic Policy Gradient')
        algo = DDPG(actor, q1, args)
    else:
        raise RuntimeError("Invalid --algo '{}'; expected 'sac', 'td3', or 'ddpg'.".format(args.algo))

    print("\tenv: {}".format(args.env_name))
    print("\tseed: {}".format(args.seed))
    print("\ttimesteps: {:n}".format(args.timesteps))
    print("\tactor_lr: {}".format(args.a_lr))
    print("\tcritic_lr: {}".format(args.c_lr))
    print("\tdiscount: {}".format(args.discount))
    print("\ttau: {}".format(args.tau))
    print("\tbatch_size: {}".format(args.batch_size))
    print("\twarmup period: {:n}".format(args.start_timesteps))
    print()

    iter = 0
    episode_reward = 0
    episode_timesteps = 0

    # create a tensorboard logging object
    logger = create_logger(args)

    if args.save_actor is None:
        args.save_actor = os.path.join(logger.dir, 'actor.pt')

    # Keep track of some statistics for each episode
    training_start = time.time()
    episode_start = time.time()
    episode_loss = 0
    update_steps = 0
    best_reward = None

    #eval_policy(algo.actor, min_timesteps=args.prenormalize_steps, max_traj_len=args.max_traj_len, visualize=False)
    train_normalizer(algo.actor, args.prenormalize_steps, noise=algo.expl_noise)

    # Fill replay buffer, update policy until n timesteps have passed
    timesteps = 0
    state = env.reset().astype(np.float32)
    while timesteps < args.timesteps:
        buffer_ready = (algo.recurrent and replay_buff.trajectories > args.batch_size) or \
                       (not algo.recurrent and replay_buff.size > args.batch_size)
        warmup = timesteps < args.start_timesteps

        state, r, done = collect_experience(algo.actor, env, replay_buff, state,
                                            episode_timesteps, max_len=args.traj_len,
                                            noise=algo.expl_noise)
        episode_reward += r
        episode_timesteps += 1
        timesteps += 1

        if not buffer_ready or warmup:
            iter = 0

        # Update the policy once our replay buffer is big enough
        if buffer_ready and done and not warmup:
            update_steps = 0
            if not algo.recurrent:
                num_updates = episode_timesteps
            else:
                num_updates = 1

            losses = []
            for _ in range(num_updates):
                losses.append(algo.update_policy(replay_buff, args.batch_size, traj_len=args.traj_len))

            episode_elapsed = time.time() - episode_start
            episode_secs_per_sample = episode_elapsed / episode_timesteps

            actor_loss = np.mean([loss[0] for loss in losses])
            critic_loss = np.mean([loss[1] for loss in losses])
            update_steps = sum([loss[-1] for loss in losses])

            logger.add_scalar(args.env_name + '/actor loss', actor_loss, timesteps - args.start_timesteps)
            logger.add_scalar(args.env_name + '/critic loss', critic_loss, timesteps - args.start_timesteps)
            logger.add_scalar(args.env_name + '/update steps', update_steps, timesteps - args.start_timesteps)

            if args.algo == 'sac':
                alpha_loss = np.mean([loss[2] for loss in losses])
                logger.add_scalar(args.env_name + '/alpha loss', alpha_loss, timesteps - args.start_timesteps)

            # rough ETA from the average wall-clock cost of a sample so far
            completion = 1 - float(timesteps) / args.timesteps
            avg_sample_r = (time.time() - training_start) / timesteps
            secs_remaining = avg_sample_r * args.timesteps * completion
            hrs_remaining = int(secs_remaining // (60 * 60))
            min_remaining = int(secs_remaining - hrs_remaining * 60 * 60) // 60

            if iter % args.eval_every == 0 and iter != 0:
                eval_reward = eval_policy(algo.actor, min_timesteps=1000, verbose=False,
                                          visualize=False, max_traj_len=args.traj_len)
                logger.add_scalar(args.env_name + '/return', eval_reward, timesteps - args.start_timesteps)

                print("evaluation after {:4d} episodes | return: {:7.3f} | timesteps {:9n}{:100s}".format(
                    iter, eval_reward, timesteps - args.start_timesteps, ''))

                if best_reward is None or eval_reward > best_reward:
                    torch.save(algo.actor, args.save_actor)
                    best_reward = eval_reward
                    print("\t(best policy so far! saving to {})".format(args.save_actor))

        try:
            print("episode {:5d} | episode timestep {:5d}/{:5d} | return {:5.1f} | update timesteps: {:7n} | {:3.1f}s/1k samples | approx. {:3d}h {:02d}m remain\t\t\t\t".format(
                iter, episode_timesteps, args.traj_len, episode_reward, update_steps,
                1000 * episode_secs_per_sample, hrs_remaining, min_remaining), end='\r')
        except NameError:
            pass

        if done:
            if hasattr(algo.actor, 'init_hidden_state'):
                algo.actor.init_hidden_state()

            episode_start, episode_reward, episode_timesteps, episode_loss = time.time(), 0, 0, 0
            iter += 1
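# collect_experience is defined elsewhere in the repo; the loop above only
# assumes it steps the env once, stores the transition, and returns
# (next_state, reward, done). A minimal sketch of that contract, with a
# hypothetical buff.push API and Gaussian exploration noise:
import torch
import numpy as np

def collect_experience_sketch(actor, env, buff, state, episode_timesteps, max_len=400, noise=0.1):
    with torch.no_grad():
        action = actor(torch.as_tensor(state).float()).numpy()
    action += np.random.normal(0, noise, size=action.shape)  # exploration noise

    next_state, reward, done, _ = env.step(action)
    done = done or episode_timesteps + 1 >= max_len  # treat timeouts as episode ends

    buff.push(state, action, reward, next_state, done)  # hypothetical buffer API

    if done:
        next_state = env.reset()
    return np.asarray(next_state, dtype=np.float32), reward, done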
def run_experiment(args): """ The entry point for the dynamics extraction algorithm. """ from util.log import create_logger locale.setlocale(locale.LC_ALL, '') policy = torch.load(args.policy) legacy = 'legacy' if not (hasattr(policy, 'legacy') and policy.legacy == False) else '' env_fn = env_factory(policy.env_name + legacy) layers = [int(x) for x in args.layers.split(',')] env = env_fn() policy.init_hidden_state() policy(torch.tensor(env.reset()).float()) latent_dim = get_hiddens(policy).shape[0] models = [] opts = [] for fn in [env.get_friction, env.get_damping, env.get_mass, env.get_quat]: output_dim = fn().shape[0] model = Model(latent_dim, output_dim, layers=layers) models += [model] opts += [optim.Adam(model.parameters(), lr=args.lr, eps=1e-5)] logger = create_logger(args) best_loss = None actor_dir = os.path.split(args.policy)[0] create_new = True #if os.path.exists(os.path.join(actor_dir, 'test_latents.pt')): if False: x = torch.load(os.path.join(logger.dir, 'train_latents.pt')) test_x = torch.load(os.path.join(logger.dir, 'test_latents.pt')) train_frics = torch.load(os.path.join(logger.dir, 'train_frics.pt')) test_frics = torch.load(os.path.join(logger.dir, 'test_frics.pt')) train_damps = torch.load(os.path.join(logger.dir, 'train_damps.pt')) test_damps = torch.load(os.path.join(logger.dir, 'test_damps.pt')) train_masses = torch.load(os.path.join(logger.dir, 'train_masses.pt')) test_masses = torch.load(os.path.join(logger.dir, 'test_masses.pt')) train_quats = torch.load(os.path.join(logger.dir, 'train_quats.pt')) test_quats = torch.load(os.path.join(logger.dir, 'test_quats.pt')) if args.points > len(x) + len(y): create_new = True else: create_new = False if create_new: if not ray.is_initialized(): if args.redis is not None: ray.init(redis_address=args.redis) else: ray.init(num_cpus=args.workers) print("Collecting {:4d} timesteps of data.".format(args.points)) points_per_worker = max(args.points // args.workers, 1) start = time.time() frics, damps, masses, quats, x = concat( ray.get([ collect_data.remote(policy, points=points_per_worker) for _ in range(args.workers) ])) split = int(0.8 * len(x)) test_x = x[split:] x = x[:split] test_frics = frics[split:] frics = frics[:split] test_damps = damps[split:] damps = damps[:split] test_masses = masses[split:] masses = masses[:split] test_quats = quats[split:] quats = quats[:split] print( "{:3.2f} to collect {} timesteps. 
Training set is {}, test set is {}" .format(time.time() - start, len(x) + len(test_x), len(x), len(test_x))) torch.save(x, os.path.join(logger.dir, 'train_latents.pt')) torch.save(test_x, os.path.join(logger.dir, 'test_latents.pt')) torch.save(frics, os.path.join(logger.dir, 'train_frics.pt')) torch.save(test_frics, os.path.join(logger.dir, 'test_frics.pt')) torch.save(damps, os.path.join(logger.dir, 'train_damps.pt')) torch.save(test_damps, os.path.join(logger.dir, 'test_damps.pt')) torch.save(masses, os.path.join(logger.dir, 'train_masses.pt')) torch.save(test_masses, os.path.join(logger.dir, 'test_masses.pt')) torch.save(quats, os.path.join(logger.dir, 'train_quats.pt')) torch.save(test_quats, os.path.join(logger.dir, 'test_quats.pt')) for epoch in range(args.epochs): random_indices = SubsetRandomSampler(range(len(x) - 1)) sampler = BatchSampler(random_indices, args.batch_size, drop_last=False) for j, batch_idx in enumerate(sampler): batch_x = x[batch_idx] #.float() #batch_fric = frics[batch_idx] #batch_damp = damps[batch_idx] #batch_mass = masses[batch_idx] #batch_quat = quats[batch_idx] batch = [ frics[batch_idx], damps[batch_idx], masses[batch_idx], quats[batch_idx] ] losses = [] for model, batch_y, opt in zip(models, batch, opts): loss = 0.5 * (batch_y - model(batch_x)).pow(2).mean() opt.zero_grad() loss.backward() opt.step() losses.append(loss.item()) print("Epoch {:3d} batch {:4d}/{:4d} ".format( epoch, j, len(sampler) - 1), end='\r') train_y = [frics, damps, masses, quats] test_y = [test_frics, test_damps, test_masses, test_quats] order = ['friction', 'damping', 'mass', 'slope'] with torch.no_grad(): print("\nEpoch {:3d} losses:".format(epoch)) for model, y_tr, y_te, name in zip(models, train_y, test_y, order): loss_total = 0.5 * (y_tr - model(x)).pow(2).mean().item() preds = model(test_x) test_loss = 0.5 * (y_te - preds).pow(2).mean().item() pce = torch.mean(torch.abs((y_te - preds) / y_te)) err = torch.mean(torch.abs((y_te - preds))) logger.add_scalar(logger.arg_hash + '/' + name + '_loss', test_loss, epoch) logger.add_scalar(logger.arg_hash + '/' + name + '_percenterr', pce, epoch) logger.add_scalar(logger.arg_hash + '/' + name + '_abserr', err, epoch) torch.save(model, os.path.join(logger.dir, name + '_extractor.pt')) print("\t{:16s}: train {:7.6f} test {:7.6f}".format( name, loss_total, test_loss))
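# Usage sketch: load one of the extractors saved above and query it on a fresh
# latent. env, policy, and get_hiddens are reused from the surrounding code;
# 'friction' is just one of the four heads.
with torch.no_grad():
    fric_model = torch.load(os.path.join(logger.dir, 'friction_extractor.pt'))
    policy.init_hidden_state()
    policy(torch.tensor(env.reset()).float())  # advance the memory one step
    latent = get_hiddens(policy)               # same latent the extractors train on
    predicted_friction = fric_model(latent)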
def eval_policy(self, policy, args, run_args):

    def print_input_update(e):
        print(f"\n\nstance dur.: {e.stance_duration:.2f}\t swing dur.: {e.swing_duration:.2f}\t stance mode: {e.stance_mode}\n")

    if self.live_plot:
        send = self.plot_pipe.send

    def isData():
        return select.select([sys.stdin], [], [], 0) == ([sys.stdin], [], [])

    if args.debug:
        args.stats = False

    if args.reward is None:
        args.reward = run_args.reward

    max_traj_len = args.traj_len
    visualize = not args.no_viz

    print("env name: ", run_args.env_name)
    if run_args.env_name is None:
        env_name = args.env_name
    else:
        env_name = run_args.env_name

    env = env_factory(env_name,
                      command_profile=run_args.command_profile,
                      input_profile=run_args.input_profile,
                      simrate=run_args.simrate,
                      dynamics_randomization=run_args.dyn_random,
                      mirror=run_args.mirror,
                      learn_gains=run_args.learn_gains,
                      reward=run_args.reward,
                      history=run_args.history,
                      no_delta=run_args.no_delta,
                      traj=run_args.traj,
                      ik_baseline=run_args.ik_baseline)()

    if args.debug:
        env.debug = True

    if args.terrain is not None and ".npy" in args.terrain:
        env.sim = CassieSim("cassie_hfield.xml")
        hfield_data = np.load(os.path.join("./cassie/cassiemujoco/terrains/", args.terrain))
        env.sim.set_hfield_data(hfield_data.flatten())

    print(env.reward_func)
    print()

    if hasattr(policy, 'init_hidden_state'):
        policy.init_hidden_state()

    old_settings = termios.tcgetattr(sys.stdin)

    orient_add = 0
    slowmo = False

    if visualize:
        env.render()
    render_state = True
    try:
        tty.setcbreak(sys.stdin.fileno())

        state = env.reset_for_test()
        done = False
        timesteps = 0
        eval_reward = 0
        speed = 0.0
        side_speed = 0.0

        env.update_speed(speed, side_speed)

        while render_state:

            if isData():
                c = sys.stdin.read(1)
                if c == 'w':
                    speed += 0.1
                elif c == 's':
                    speed -= 0.1
                elif c == 'd':
                    side_speed += 0.02
                elif c == 'a':
                    side_speed -= 0.02
                elif c == 'j':
                    env.phase_add += .1
                    # print("Increasing frequency to: {:.1f}".format(env.phase_add))
                elif c == 'h':
                    env.phase_add -= .1
                    # print("Decreasing frequency to: {:.1f}".format(env.phase_add))
                elif c == 'l':
                    orient_add -= .1
                    # print("Increasing orient_add to: ", orient_add)
                elif c == 'k':
                    orient_add += .1
                    # print("Decreasing orient_add to: ", orient_add)
                elif c == 'x':
                    env.swing_duration += .01
                    print_input_update(env)
                elif c == 'z':
                    env.swing_duration -= .01
                    print_input_update(env)
                elif c == 'v':
                    env.stance_duration += .01
                    print_input_update(env)
                elif c == 'c':
                    env.stance_duration -= .01
                    print_input_update(env)
                elif c == '1':
                    env.stance_mode = "zero"
                    print_input_update(env)
                elif c == '2':
                    env.stance_mode = "grounded"
                    print_input_update(env)
                elif c == '3':
                    env.stance_mode = "aerial"
                elif c == 'r':
                    state = env.reset()
                    speed = env.speed
                    if hasattr(policy, 'init_hidden_state'):
                        policy.init_hidden_state()
                    print("Resetting environment via env.reset()")
                elif c == 'p':
                    push = 100
                    push_dir = 2
                    force_arr = np.zeros(6)
                    force_arr[push_dir] = push
                    env.sim.apply_force(force_arr)
                elif c == 't':
                    slowmo = not slowmo
                    print("Slowmo : \n", slowmo)

                env.update_speed(speed, side_speed)
                # print(speed)

            if self.live_plot:
                send((env.swing_duration, env.stance_duration, env.strict_relaxer, env.stance_mode, env.have_incentive))

            if args.stats:
                print(f"act spd: {env.sim.qvel()[0]:+.2f} cmd speed: {env.speed:+.2f} cmd_sd_spd: {env.side_speed:+.2f} phase add: {env.phase_add:.2f} orient add: {orient_add:+.2f}", end="\r")

            if hasattr(env, 'simrate'):
                start = time.time()

            if not env.vis.ispaused():
                # Update Orientation
                env.orient_add = orient_add

                action = policy.forward(torch.Tensor(state), deterministic=True).detach().numpy()

                state, reward, done, _ = env.step(action)

                eval_reward += reward
                timesteps += 1

            if visualize:
                render_state = env.render()
            if hasattr(env, 'simrate'):
                # sleep out the remainder of the control period (optionally 10x in slowmo)
                end = time.time()
                delaytime = max(0, env.simrate / 2000 - (end - start))
                if slowmo:
                    while time.time() - end < delaytime * 10:
                        env.render()
                        time.sleep(delaytime)
                else:
                    time.sleep(delaytime)

        print("Eval reward: ", eval_reward)

    finally:
        termios.tcsetattr(sys.stdin, termios.TCSADRAIN, old_settings)
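# The keyboard handling above relies on tty.setcbreak plus zero-timeout
# select polling so stdin reads never block the control loop. A minimal
# self-contained sketch of that pattern (Unix-only):
import select, sys, termios, time, tty

def poll_keys(duration=5.0):
    old = termios.tcgetattr(sys.stdin)
    try:
        tty.setcbreak(sys.stdin.fileno())  # char-at-a-time input, no Enter required
        end = time.time() + duration
        while time.time() < end:
            if select.select([sys.stdin], [], [], 0)[0]:  # returns immediately
                c = sys.stdin.read(1)
                print('got:', repr(c))
            time.sleep(0.01)  # the control loop would do real work here
    finally:
        termios.tcsetattr(sys.stdin, termios.TCSADRAIN, old)  # always restore the terminal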
import torch, time
import numpy as np
from pathlib import Path

from util.env import env_factory

torch.manual_seed(0)
np.random.seed(0)

policies = []
#for filename in Path('rss2').rglob('actor.pt'):
for filename in Path('rss2').rglob('actor.pt'):
    policies.append(filename)

max_traj_len = 1200
visualize = True

env = env_factory('cassie-dynamicsrandom-nodelta-stateest-clockbased')()

# pre-sample a fixed set of randomized dynamics so every policy sees the same trials
trial_damp = []
trial_mass = []
trial_ipos = []
trial_fric = []
trials = 10
for trial in range(trials):
    env.reset()
    trial_damp.append(env.sim.get_dof_damping())
    trial_mass.append(env.sim.get_body_mass())
    trial_ipos.append(env.sim.get_body_ipos())
    trial_fric.append(env.sim.get_ground_friction())

with torch.no_grad():
    for policy_name in policies:
def run_experiment(args):
    from util.env import env_factory, train_normalizer
    from util.log import create_logger
    from policies.critic import FF_V, LSTM_V, GRU_V
    from policies.actor import FF_Stochastic_Actor, LSTM_Stochastic_Actor, GRU_Stochastic_Actor, QBN_GRU_Stochastic_Actor

    import locale, os
    locale.setlocale(locale.LC_ALL, '')

    # wrapper function for creating parallelized envs
    env_fn = env_factory(args.env)
    obs_dim = env_fn().observation_space.shape[0]
    action_dim = env_fn().action_space.shape[0]

    # Set seeds
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    std = torch.ones(action_dim) * args.std

    layers = [int(x) for x in args.layers.split(',')]

    if args.arch.lower() == 'lstm':
        policy = LSTM_Stochastic_Actor(obs_dim, action_dim, env_name=args.env, fixed_std=std, bounded=False, layers=layers)
        critic = LSTM_V(obs_dim, layers=layers)
    elif args.arch.lower() == 'gru':
        policy = GRU_Stochastic_Actor(obs_dim, action_dim, env_name=args.env, fixed_std=std, bounded=False, layers=layers)
        critic = GRU_V(obs_dim, layers=layers)
    elif args.arch.lower() == 'qbngru':
        policy = QBN_GRU_Stochastic_Actor(obs_dim, action_dim, env_name=args.env, fixed_std=std, bounded=False, layers=layers)
        critic = GRU_V(obs_dim, layers=layers)
    elif args.arch.lower() == 'ff':
        policy = FF_Stochastic_Actor(obs_dim, action_dim, env_name=args.env, fixed_std=std, bounded=False, layers=layers)
        critic = FF_V(obs_dim, layers=layers)
    else:
        raise RuntimeError("Invalid --arch '{}'; expected 'lstm', 'gru', 'qbngru', or 'ff'.".format(args.arch))
    policy.legacy = False
    env = env_fn()

    print("Collecting normalization statistics with {} states...".format(args.prenormalize_steps))
    train_normalizer(policy, args.prenormalize_steps, max_traj_len=args.traj_len, noise=1)

    critic.copy_normalizer_stats(policy)

    policy.train(0)
    critic.train(0)

    algo = PPO(policy, critic, env_fn, args)

    # create a tensorboard logging object
    if not args.nolog:
        logger = create_logger(args)
    else:
        logger = None

    if args.save_actor is None and logger is not None:
        args.save_actor = os.path.join(logger.dir, 'actor.pt')

    if args.save_critic is None and logger is not None:
        args.save_critic = os.path.join(logger.dir, 'critic.pt')

    print()
    print("Proximal Policy Optimization:")
    print("\tseed: {}".format(args.seed))
    print("\tenv: {}".format(args.env))
    print("\ttimesteps: {:n}".format(int(args.timesteps)))
    print("\titeration steps: {:n}".format(int(args.num_steps)))
    print("\tprenormalize steps: {}".format(int(args.prenormalize_steps)))
    print("\ttraj_len: {}".format(args.traj_len))
    print("\tdiscount: {}".format(args.discount))
    print("\tactor_lr: {}".format(args.a_lr))
    print("\tcritic_lr: {}".format(args.c_lr))
    print("\tadam eps: {}".format(args.eps))
    print("\tentropy coeff: {}".format(args.entropy_coeff))
    print("\tgrad clip: {}".format(args.grad_clip))
    print("\tbatch size: {}".format(args.batch_size))
    print("\tepochs: {}".format(args.epochs))
    print("\tworkers: {}".format(args.workers))
    print()

    itr = 0
    timesteps = 0
    best_reward = None
    while timesteps < args.timesteps:
        eval_reward, kl, a_loss, c_loss, m_loss, s_loss, steps, times = algo.do_iteration(
            args.num_steps, args.traj_len, args.epochs,
            batch_size=args.batch_size, kl_thresh=args.kl, mirror=args.mirror)

        timesteps += steps
        print("iter {:4d} | return: {:5.2f} | KL {:5.4f} | ".format(itr, eval_reward, kl), end='')
        if m_loss != 0:
            print("mirror {:6.5f} | ".format(m_loss), end='')

        if s_loss != 0:
            print("sparsity {:6.5f} | ".format(s_loss), end='')

        print("timesteps {:n}".format(timesteps))

        if best_reward is None or eval_reward > best_reward:
            print("\t(best policy so far! saving to {})".format(args.save_actor))
            best_reward = eval_reward
            if args.save_actor is not None:
                torch.save(algo.actor, args.save_actor)

            if args.save_critic is not None:
                torch.save(algo.critic, args.save_critic)

        if logger is not None:
            logger.add_scalar(args.env + '/kl', kl, timesteps)
            logger.add_scalar(args.env + '/return', eval_reward, timesteps)
            logger.add_scalar(args.env + '/actor loss', a_loss, timesteps)
            logger.add_scalar(args.env + '/critic loss', c_loss, timesteps)
            logger.add_scalar(args.env + '/mirror loss', m_loss, timesteps)
            logger.add_scalar(args.env + '/sparsity loss', s_loss, timesteps)
            logger.add_scalar(args.env + '/sample rate', times[0], timesteps)
            logger.add_scalar(args.env + '/update time', times[1], timesteps)

        itr += 1
    print("Finished ({} of {}).".format(timesteps, args.timesteps))