def evaluate(self):
    test_env = TimeLimit(gym.make('PepperPush-v0'), max_episode_steps=100)
    obs = test_env.reset()
    results = evaluate_policy(self.model, test_env, n_eval_episodes=75,
                              return_episode_rewards=False)
    return results[0]
def collect_fixed_set_of_states(conf: dict, env: TimeLimit) -> list:
    # Collect samples to evaluate the agent on a fixed set of samples
    # (DQN paper). Collect a fixed set of states by running a random policy
    # before training starts and track the average of the maximum predicted
    # Q for these states.
    env.reset()
    exclude = conf['preprocess']['exclude']
    fixed_states = []
    while True:
        action = env.action_space.sample()
        next_state, reward, done, _ = env.step(action)
        state = next_state
        preprocessed_state = preprocess_frame(state, exclude)
        fixed_states.append(preprocessed_state)
        if done:
            break
    env.close()
    print(f'Collected {len(fixed_states)} fixed set of states!')
    return fixed_states
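# A minimal sketch (not part of the original training loop, names are
# illustrative) of how the fixed states collected above could be used:
# periodically average the maximum predicted Q over the fixed set, as
# suggested in the DQN paper. Assumes an `estimator` whose `predict(state)`
# returns per-action Q-values as a torch tensor, matching the CNN_DQN
# interface used elsewhere in this file.
def average_max_q(estimator, fixed_states) -> float:
    max_qs = []
    for state in fixed_states:
        q_values = estimator.predict(state)        # per-action Q-values
        max_qs.append(torch.max(q_values).item())  # value of the best action
    return sum(max_qs) / len(max_qs)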
def render_episode(env: TimeLimit, estimator: CNN_DQN):
    obs = env.reset()
    state = get_state(obs)
    is_done = False
    while not is_done:
        sleep(0.0415)  # (~24 fps)
        actionIndex = torch.argmax(estimator.predict(state)).item()
        action = ACTIONS[actionIndex]
        obs, reward, is_done, info = env.step(action)
        state = get_state(obs)
        env.render()
    env.close()
def run_episode(env: TimeLimit, estimator: CNN_DQN):
    obs = env.reset()
    state = get_state(obs)
    is_done = False
    total_reward = 0
    while not is_done:
        actionIndex = torch.argmax(estimator.predict(state)).item()
        action = ACTIONS[actionIndex]
        obs, reward, is_done, info = env.step(action)
        state = get_state(obs)
        total_reward += reward
    return total_reward
def render_episode(env: TimeLimit, estimator: CNN_DQN):
    obs = env.reset()
    state = get_state(obs)
    is_done = False
    while not is_done:
        sleep(0.0415)  # (~24 fps)
        rgb = env.render('rgb_array')
        upscaled = repeat_upsample(rgb, 3, 4)
        viewer.imshow(upscaled)
        actionIndex = torch.argmax(estimator.predict(state)).item()
        action = ACTIONS[actionIndex]
        obs, reward, is_done, _ = env.step(action)
        if reward != 0:
            print(reward)
        state = get_state(obs)
    env.close()
def monte_carlo_control_epsilon_greedy(
        env: TimeLimit,
        stats: dict,
        num_episodes: int,
        policy: Callable,
        discount_factor: float = 1.0,
        max_epsilon: float = 1.0,
        min_epsilon: float = 0.001,
        decay_rate: float = 0.00005) -> np.ndarray:
    """
    Monte Carlo Control using Epsilon-Greedy policies.
    Finds the optimal state-action value function.

    Args:
        env: OpenAI gym environment
        stats: Dictionary that contains statistics about the experiment
        num_episodes: Number of episodes to sample
        policy: Function that returns an action according to a policy
        discount_factor: Gamma discount factor
        max_epsilon: Max epsilon value from where the decay starts
        min_epsilon: Min epsilon value until the decay lasts
        decay_rate: Rate of the exponential decay

    Returns:
        The optimal state-action value function.
    """
    num_wins = 0
    num_actions = env.action_space.n
    num_states = env.observation_space.n
    q_table = np.zeros((num_states, num_actions))
    returns_sum = defaultdict(float)
    returns_count = defaultdict(float)
    epsilon = max_epsilon

    for i_episode in tqdm(range(num_episodes)):
        unique_state_action_pairs = set()
        state_action_pairs_in_episode = []
        rewards_in_episode = []
        done = False
        state = env.reset()
        t = 0
        while not done:
            action = policy(env, q_table, state, epsilon)
            next_state, reward, done, _ = env.step(action)
            state_action_pairs_in_episode.append([state, action])
            unique_state_action_pairs.add((state, action))
            rewards_in_episode.append(reward)
            state = next_state
            stats['train/episode_rewards'][i_episode] += reward
            stats['train/episode_lengths'][i_episode] = t
            if reward == 1:
                num_wins += 1
            t += 1

        state_action_pairs_in_episode = np.array(
            state_action_pairs_in_episode).astype(int)

        # Find all (state, action) pairs we've visited in this episode
        for state_action in unique_state_action_pairs:
            # First index where both the state and the action match
            first_occurrence_idx = np.where(
                (state_action_pairs_in_episode == state_action).all(axis=1))[0][0]
            # Sum up all the rewards since the first occurrence
            G = sum([r * (discount_factor ** i)
                     for i, r in enumerate(
                         rewards_in_episode[first_occurrence_idx:])])
            # Calculate average return for this state over all sampled episodes
            returns_sum[state_action] += G
            returns_count[state_action] += 1.0
            st, act = state_action
            q_table[st, act] = (
                returns_sum[state_action] / returns_count[state_action]
            )

        stats['train/epsilon'][i_episode] = epsilon
        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(
            -decay_rate * i_episode)
        win_ratio = num_wins / (i_episode + 1)
        stats['train/win_ratio'][i_episode] = win_ratio
        if i_episode % 5000 == 0 and i_episode > 0:
            print(f'Current win ratio is {win_ratio}, epsilon: {epsilon}')

    # The policy is improved implicitly by changing the q_table
    print(f'Win ratio: {round(num_wins / num_episodes, 5)}')
    return q_table
def q_learning_control_epsilon_greedy(
        env: TimeLimit,
        stats: dict,
        num_episodes: int,
        policy: Callable,
        discount_factor: float = 1.0,
        learning_rate: float = 0.5,
        max_epsilon: float = 1.0,
        min_epsilon: float = 0.001,
        decay_rate: float = 0.00005) -> np.ndarray:
    """
    Q-Learning algorithm: Off-policy TD control.
    Finds the optimal greedy policy while following an epsilon-greedy policy.

    Args:
        env: OpenAI environment
        stats: Dictionary contains statistics about the experiment
        num_episodes: Number of episodes to run for
        policy: Function that returns an action according to a policy
        discount_factor: Gamma discount factor
        learning_rate: TD learning rate
        max_epsilon: Max epsilon value from where the decay starts
        min_epsilon: Min epsilon value until the decay lasts
        decay_rate: Rate of the exponential decay

    Returns:
        Q table with state-action values
    """
    num_wins = 0
    num_actions = env.action_space.n
    num_states = env.observation_space.n
    q_table = np.zeros((num_states, num_actions))
    epsilon = max_epsilon

    for i_episode in tqdm(range(num_episodes)):
        state = env.reset()
        for t in itertools.count():
            action = policy(env, q_table, state, epsilon)
            next_state, reward, done, info = env.step(action)

            stats["train/episode_rewards"][i_episode] += reward
            stats["train/episode_lengths"][i_episode] = t

            td_target = reward + discount_factor * np.max(q_table[next_state])
            td_error = td_target - q_table[state, action]
            q_table[state, action] += learning_rate * td_error
            state = next_state

            if done:
                if reward == 1:
                    num_wins += 1
                break

        stats["train/epsilon"][i_episode] = epsilon
        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(
            -decay_rate * i_episode)
        win_ratio = num_wins / (i_episode + 1)
        stats['train/win_ratio'][i_episode] = win_ratio
        if i_episode % 5000 == 0 and i_episode > 0:
            print(f'Current win ratio is {win_ratio}, epsilon: {epsilon}')

    return q_table
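# A minimal usage sketch (an assumption, not part of the original code) of the
# interface the tabular control functions above expect: a `policy` callable
# with signature policy(env, q_table, state, epsilon), a `stats` dict of
# preallocated arrays keyed with the 'train/' prefix, and a discrete-state
# environment such as FrozenLake. The function names `epsilon_greedy_policy`
# and `example_tabular_training`, the environment id, and the hyperparameter
# values below are illustrative only.
def epsilon_greedy_policy(env, q_table, state, epsilon):
    # With probability epsilon explore, otherwise exploit the current Q table.
    if np.random.uniform() < epsilon:
        return env.action_space.sample()
    return int(np.argmax(q_table[state]))


def example_tabular_training(num_episodes: int = 100000) -> np.ndarray:
    train_env = TimeLimit(gym.make('FrozenLake-v0').unwrapped,
                          max_episode_steps=100)
    train_stats = {
        'train/episode_rewards': np.zeros(num_episodes),
        'train/episode_lengths': np.zeros(num_episodes),
        'train/epsilon': np.zeros(num_episodes),
        'train/win_ratio': np.zeros(num_episodes),
    }
    return q_learning_control_epsilon_greedy(
        train_env, train_stats, num_episodes, epsilon_greedy_policy)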
proj = la.svd(proj, full_matrices=False)[2]
enc_dim = proj.shape[0]
weights = np.load(p_dir + "weights.npz")
biases = np.load(p_dir + "biases.npz")
weights = [v for k, v in weights.items()]
biases = [v for k, v in biases.items()]

saveload_path = "./experiments/learned_controllers/pendulum/{}".format(i)
model = DDPG.load(saveload_path + "model")

# now let's test the model
# specify the test task
n_test_steps = 100

# restart the env
env = TimeLimit(RestartablePendulumEnv(), max_episode_steps=200)
env = EncoderWrapper(env, mlp_encoder, [weights, biases, proj])

# for each test state, start the env in the state, then run forward and collect rewards
for k in range(3):
    high = np.array([np.pi, 1])
    start_state = np.random.uniform(low=-high, high=high)
    obs = env.reset(state=start_state)
    for j in range(n_test_steps):
        action, _states = model.predict(obs)
        obs, reward, dones, info = env.step(action)
        env.render()

# clean up and save results
env.close()
del model
def main(k):
    path = './direction_BS_woNorm/150/{}'.format(k)
    if not os.path.exists(path):
        os.makedirs(path)

    ############## Hyperparameters ##############
    env_name = "fishEvasion-v0"  # used when creating the environment with gym.make
    render = False               # render the environment in training if true
    # solved_reward = 100        # stop training if avg_reward > solved_reward
    log_interval = 27            # print avg reward in the interval
    max_episodes = 10000         # max training episodes
    max_timesteps = 150          # max timesteps in one episode

    update_timestep = 4050       # update policy every n timesteps
    action_std = 0.5             # constant std for action distribution (Multivariate Normal)
    K_epochs = 80                # update policy for K epochs
    eps_clip = 0.2               # clip parameter for PPO
    gamma = 0.99                 # discount factor

    lr = 0.0003                  # parameters for Adam optimizer
    betas = (0.9, 0.999)

    random_seed = None
    #############################################

    # creating environment
    env = fish.FishEvasionEnv(dt=0.1)
    # set the length of an episode
    from gym.wrappers.time_limit import TimeLimit
    env = TimeLimit(env, max_episode_steps=max_timesteps)

    # get observation and action dimensions from the environment
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]

    if random_seed:
        print("Random Seed: {}".format(random_seed))
        torch.manual_seed(random_seed)
        env.seed(random_seed)
        np.random.seed(random_seed)

    memory = Memory()
    ppo = PPO(state_dim, action_dim, action_std, lr, betas, gamma, K_epochs, eps_clip)

    # ------------------------------------------------------------------
    # start training from an existing policy
    # ppo.policy_old.load_state_dict(torch.load('./direction_policy/PPO_{}_{:06d}.pth'.format(env_name, 4380), map_location=device))
    # ppo.policy.load_state_dict(torch.load('./direction_policy/PPO_{}_{:06d}.pth'.format(env_name, 4380), map_location=device))
    # ------------------------------------------------------------------

    # logging variables
    running_reward = 0
    avg_length = 0
    time_step = 0

    # training loop
    for i_episode in range(1, max_episodes + 1):
        # ------------------------------------------------------------------
        # set a specific distribution for beta
        # beta0 = angle_normalize(i_episode * 3, center=0)
        # print(beta0)
        # ------------------------------------------------------------------
        state = env.reset()
        for t in range(max_timesteps):
            time_step += 1
            # Running policy_old:
            action = ppo.select_action(state, memory)
            state, reward, done, _ = env.step(action)

            # Storing reward and is_terminals:
            memory.rewards.append(reward)
            memory.is_terminals.append(done)

            # update if it is time
            # ------------------------------------------------------------------
            if time_step % update_timestep == 0:
                ppo.update(memory)
                memory.clear_memory()
                time_step = 0
            # ------------------------------------------------------------------
            running_reward += reward
            if render:
                env.render()
            # break if episode ends
            if done:
                break

        avg_length += t

        # ------------------------------------------------------------------
        # stop training if avg_reward > solved_reward
        # if running_reward > (log_interval * solved_reward):
        #     print("########## Solved! ##########")
        #     torch.save(ppo.policy.state_dict(), './PPO_continuous_forwardWoPos_solved_{}.pth'.format(env_name))
        #     break
        # ------------------------------------------------------------------

        # save every 50 episodes
        if i_episode % 50 == 0:
            torch.save(ppo.policy.state_dict(),
                       path + '/PPO_{}_direction{:06d}.pth'.format(env_name, i_episode))
        # ------------------------------------------------------------------

        # logging
        if i_episode % log_interval == 0:
            avg_length = int(avg_length / log_interval)
            running_reward = running_reward / log_interval
            print('Episode {} \t Avg length: {} \t Avg reward: {}'.format(
                i_episode, avg_length, running_reward))
            running_reward = 0
            avg_length = 0
def q_learning(env: TimeLimit, estimator: CNN_DQN, n_episode,
               target_update_every=10, gamma=1.0, epsilon=0.1,
               epsilon_decay=0.99, replay_size=32):
    step = 0
    for episode in range(n_episode):
        policy = estimator.gen_epsilon_greedy_policy(epsilon, n_action)
        obs = env.reset()
        state = get_state(obs)
        is_done = False
        while not is_done:
            actionAi = policy(state)
            actionGym = ACTIONS[actionAi]
            next_obs, reward, is_done, _ = env.step(actionGym)
            next_state = get_state(next_obs)
            total_reward_episode[episode] += reward
            memory.append((state, actionAi, next_state, reward, is_done))
            if is_done:
                break
            estimator.replay(memory, replay_size, gamma)
            state = next_state
            step += 1
            sys.stdout.write('\rstep {}'.format(step))
        print('Episode {}: reward: {}, epsilon: {}'.format(
            episode, total_reward_episode[episode], epsilon))
        epsilon = max(epsilon * epsilon_decay, 0.01)
        if (episode % target_update_every) == 1:
            # update target NN
            estimator.copy_target()
            estimator.save()
        if (episode % 100) == 1:
            # render_episode(env, estimator)
            # check if NN is well trained
            total_wins = 0
            for test in range(100):
                total_wins += 1 if run_episode(env, estimator) > 0 else 0
            if total_wins > 90:
                estimator.copy_target()
                estimator.save()
                print('Finished training due to successful model')
                break
def sarsa(env: TimeLimit,
          stats: dict,
          num_episodes: int,
          policy: Callable,
          discount_factor: float = 1.0,
          learning_rate: float = 0.5,
          max_epsilon: float = 1.0,
          min_epsilon: float = 0.03,
          decay_rate: float = 0.00005) -> np.ndarray:
    """
    SARSA algorithm: On-policy TD control.
    Finds an optimal Q state-action value function.

    Args:
        env: OpenAI environment
        stats: Dictionary contains statistics about the experiment
        num_episodes: Number of episodes to run for
        policy: Function that returns an action according to a policy
        discount_factor: Gamma discount factor
        learning_rate: TD learning rate
        max_epsilon: Max epsilon value from where the decay starts
        min_epsilon: Min epsilon value until the decay lasts
        decay_rate: Rate of the exponential decay

    Returns:
        Q table with state-action values.
    """
    num_wins = 0
    num_actions = env.action_space.n
    num_states = env.observation_space.n
    q_table = np.zeros((num_states, num_actions))
    epsilon = max_epsilon

    for i_episode in tqdm(range(num_episodes)):
        state = env.reset()
        action = policy(env, q_table, state, epsilon)
        for t in itertools.count():
            next_state, reward, done, _ = env.step(action)
            next_action = policy(env, q_table, next_state, epsilon)

            if reward == 1:
                num_wins += 1

            stats['episode_rewards'][i_episode] += reward
            stats['episode_lengths'][i_episode] = t

            td_target = reward + discount_factor * q_table[next_state, next_action]
            td_delta = td_target - q_table[state, action]
            q_table[state, action] += learning_rate * td_delta

            if done:
                break

            action = next_action
            state = next_state

        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(
            -decay_rate * i_episode)
        if i_episode % 5000 == 0 and i_episode > 0:
            print(f"Current win ratio is {num_wins / i_episode}, "
                  f"epsilon: {epsilon}")

    print(f"Win ratio: {round(num_wins / num_episodes, 5)}")
    return q_table
def n_step_sarsa(env: TimeLimit,
                 stats: dict,
                 num_episodes: int,
                 policy: Callable,
                 discount_factor: float = 0.9,
                 learning_rate: float = 0.8,
                 max_epsilon: float = 1.0,
                 min_epsilon: float = 0.001,
                 decay_rate: float = 0.00005,
                 n_steps: int = 5):
    """
    N-step Sarsa for estimating Q (N-step bootstrapping).

    Args:
        env: OpenAI environment
        stats: Dictionary contains statistics about the experiment
        num_episodes: Number of episodes to run for
        policy: Function that returns an action according to a policy
        discount_factor: Gamma discount factor
        learning_rate: TD learning rate
        max_epsilon: Max epsilon value from where the decay starts
        min_epsilon: Min epsilon value until the decay lasts
        decay_rate: Rate of the exponential decay
        n_steps: Steps to bootstrap

    Returns:
        Q table with state-action values
    """
    num_wins = 0
    num_actions = env.action_space.n
    num_states = env.observation_space.n
    q_table = np.zeros((num_states, num_actions))
    epsilon = max_epsilon

    for i_episode in tqdm(range(num_episodes)):
        T = np.inf
        state = env.reset()
        action = policy(env, q_table, state, epsilon)
        actions = [action]
        states = [state]
        rewards = [0]

        for t in itertools.count():
            if t < T:
                next_state, reward, done, _ = env.step(action)
                states.append(next_state)
                rewards.append(reward)

                if done:
                    T = t + 1
                    if reward == 1:
                        num_wins += 1
                else:
                    # select and store the next action from the state just reached
                    action = policy(env, q_table, next_state, epsilon)
                    actions.append(action)

                stats["episode_rewards"][i_episode] += reward
                stats["episode_lengths"][i_episode] = t

            # state tau being updated
            tau = t - n_steps + 1
            if tau >= 0:
                G = 0
                for i in range(tau + 1, min(tau + n_steps + 1, T + 1)):
                    G += np.power(discount_factor, i - tau - 1) * rewards[i]
                if tau + n_steps < T:
                    st = states[tau + n_steps]
                    act = actions[tau + n_steps]
                    G += np.power(discount_factor, n_steps) * q_table[st, act]
                # update Q values
                st = states[tau]
                act = actions[tau]
                q_table[st, act] += learning_rate * (G - q_table[st, act])

            if tau == T - 1:
                break

        stats["epsilon"][i_episode] = epsilon
        # Reduce epsilon (because we need less and less exploration)
        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(
            -decay_rate * i_episode)
        if i_episode % 5000 == 0 and i_episode > 0:
            print(f"Game won {num_wins} times. Current win ratio is "
                  f"{num_wins / i_episode}, epsilon: {epsilon}")

    print(f"Win ratio: {round(num_wins / num_episodes, 5)}")
    return q_table
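# A brief illustrative sketch (an assumption, not from the original code):
# sarsa and n_step_sarsa above index a `stats` dict keyed WITHOUT the 'train/'
# prefix used by the Monte Carlo and Q-learning variants earlier in this file.
# A preallocation like the following matches the keys they write to; the
# helper name is hypothetical.
def make_sarsa_stats(num_episodes: int) -> dict:
    return {
        'episode_rewards': np.zeros(num_episodes),
        'episode_lengths': np.zeros(num_episodes),
        'epsilon': np.zeros(num_episodes),
    }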
class Worker(object):
    def __init__(self, name, globalAC, hard_share=None,
                 soft_sharing_coeff_actor=0.0, soft_sharing_coeff_critic=0.0,
                 gradient_clip_actor=0.0, gradient_clip_critic=0.0,
                 debug=False, max_ep_steps=200, image_shape=None, stack=1):
        self.env = gym.make(GAME).unwrapped
        self.env = TimeLimit(self.env, max_episode_steps=max_ep_steps)
        self.name = name
        self.AC = ACNet(name, globalAC, hard_share=hard_share,
                        soft_sharing_coeff_actor=soft_sharing_coeff_actor,
                        soft_sharing_coeff_critic=soft_sharing_coeff_critic,
                        gradient_clip_actor=gradient_clip_actor,
                        gradient_clip_critic=gradient_clip_critic,
                        image_shape=image_shape, stack=stack)
        self.debug = debug
        self.image_shape = image_shape
        self.stack = stack

    def work(self):
        def get_img(fn, *args):
            img_lock.acquire()
            results = fn(*args)
            img = self.env.render(mode='rgb_array')
            img_lock.release()
            img = rgb2grey(img)
            img = resize(img, self.image_shape)
            return img, results

        def env_reset_obs():
            return self.env.reset()

        def env_reset_img():
            img, _ = get_img(env_reset_obs)
            return img

        def env_step_obs(a):
            return self.env.step(a)

        def env_step_img(a):
            img, results = get_img(env_step_obs, a)
            return img, results[1], results[2], results[3]

        if self.image_shape is not None:
            env_reset_fn = env_reset_img
            env_step_fn = env_step_img
        else:
            env_reset_fn = env_reset_obs
            env_step_fn = env_step_obs

        global GLOBAL_RUNNING_R, GLOBAL_R, GLOBAL_EP, MAX_GLOBAL_EP
        total_step = 1
        buffer_s, buffer_a, buffer_r = [], [], []
        while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
            s = env_reset_fn()
            buffer_s = [s] * self.stack
            ep_r = 0
            while True:
                a = self.AC.choose_action(buffer_s[-self.stack:])
                s_, r, done, info = env_step_fn(a)
                if done:
                    r = -5
                ep_r += r
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)

                if total_step % UPDATE_GLOBAL_ITER == 0 or done:
                    # update global and assign to local net
                    if done:
                        v_s_ = 0  # terminal
                    else:
                        obs_hist = buffer_s[-(self.stack - 1):] + [s_, ]
                        feed_dict = {
                            var: obs[np.newaxis, :]
                            for var, obs in zip(self.AC.s, obs_hist)
                        }
                        v_s_ = SESS.run(self.AC.v, feed_dict=feed_dict)[0, 0]
                    buffer_v_target = []
                    for r in buffer_r[::-1]:  # reverse buffer r
                        v_s_ = r + GAMMA * v_s_
                        buffer_v_target.append(v_s_)
                    buffer_v_target.reverse()

                    if self.image_shape is not None:
                        buffer_s_ = [
                            buffer_s_[np.newaxis, :] for buffer_s_ in buffer_s
                        ]
                    else:
                        buffer_s_ = copy.deepcopy(buffer_s)
                    obs_columns = [
                        np.vstack(buffer_s_[idx:-(self.stack - idx)])
                        for idx in range(self.stack)
                    ]
                    buffer_a, buffer_v_target = np.array(buffer_a), np.vstack(
                        buffer_v_target)
                    feed_dict = {
                        var: obs
                        for var, obs in zip(self.AC.s, obs_columns)
                    }
                    feed_dict[self.AC.a_his] = buffer_a
                    feed_dict[self.AC.v_target] = buffer_v_target

                    if self.debug and self.name == 'W_0':
                        a_loss, c_loss, t_td, c_loss, t_log_prob, t_exp_v, t_entropy, t_exp_v2, a_loss, a_grads, c_grads = self.AC.get_stats(feed_dict)
                        # print("a_loss: ", a_loss.shape, " ", a_loss, "\tc_loss: ", c_loss.shape, " ", c_loss, "\ttd: ", t_td.shape, " ", t_td, "\tlog_prob: ", t_log_prob.shape, " ", t_log_prob, "\texp_v: ", t_exp_v.shape, " ", t_exp_v, "\tentropy: ", t_entropy.shape, " ", t_entropy, "\texp_v2: ", t_exp_v2.shape, " ", t_exp_v2, "\ta_grads: ", [np.sum(weights) for weights in a_grads], "\tc_grads: ", [np.sum(weights) for weights in c_grads])
                        print("a_loss: ", a_loss.shape, " ", a_loss,
                              "\tc_loss: ", c_loss)

                    c_loss, a_loss, entropy = self.AC.update_global(feed_dict)
                    # import ipdb; ipdb.set_trace()
                    buffer_s, buffer_a, buffer_r = buffer_s[-(self.stack):], [], []
                    self.AC.pull_global()

                s = s_
                total_step += 1
                if done:
                    GLOBAL_R.append(ep_r)
                    if len(GLOBAL_RUNNING_R) == 0:  # record running episode reward
                        GLOBAL_RUNNING_R.append(ep_r)
                    else:
                        GLOBAL_RUNNING_R.append(
                            0.99 * GLOBAL_RUNNING_R[-1] + 0.01 * ep_r)
                    log_lock.acquire()
                    logger.record_tabular("global_ep", GLOBAL_EP)
                    logger.record_tabular("name", self.name)
                    logger.record_tabular("ep_r", ep_r)
                    logger.record_tabular("ep_r_weighted", GLOBAL_RUNNING_R[-1])
                    logger.record_tabular("c_loss", c_loss)
                    logger.record_tabular("a_loss", a_loss)
                    logger.record_tabular("entropy", entropy)
                    logger.dump_tabular()
                    log_lock.release()
                    GLOBAL_EP += 1
                    break
plt.subplot(2, 1, 2)
plt.plot(np.arange(len(GLOBAL_R)), GLOBAL_R)
plt.xlabel('step')
plt.ylabel('Total moving reward')
if args.log:
    name = 'plot_' + str(MAX_GLOBAL_EP) + '_sharing_'
    if args.hard_share is not None:
        name += 'hard'
    elif soft_sharing_coeff_actor > 0. or soft_sharing_coeff_critic > 0.:
        name += 'soft'
    else:
        name += 'none'
    name += '_lra_' + str(lr_a) + '_lrc_' + str(lr_c) + '.png'
    plt.savefig(name)
else:
    plt.show()

env = gym.make(GAME).unwrapped
env = TimeLimit(env, max_episode_steps=args.max_ep_steps)
s = env.reset()
buffer_s = [s] * args.stack
tidx = 0
done = False
while tidx < 1000 and not done:
    a = workers[0].AC.choose_action(buffer_s[-args.stack:])
    env.render()
    s_, r, done, info = env.step(a)
    s = s_
    buffer_s.append(s)
    tidx += 1
from test_envs.cartpole_continous import CartPoleContinousEnv
from gym.wrappers.time_limit import TimeLimit
from test_policies.pd import PD
import numpy as np
import time

PD_coeff = np.array([2.0, 1.0, 10.0, 2.0])
policy = PD(PD_coeff)
env = TimeLimit(CartPoleContinousEnv(), max_episode_steps=200)
state = env.reset()
total_reward = total_length = 0

for step in range(10000):
    action = policy(np.array(state))
    state, reward, done, info = env.step(action)
    total_reward += reward
    total_length += 1
    # env.render()
    # time.sleep(0.01)
    if done:
        # each episode should last the full 200 steps under the PD controller
        assert total_reward == 200
        assert total_length == 200
        state = env.reset()
        total_reward = total_length = 0
class ObstacleGoalEnv(VanillaGoalEnv):
    def __init__(self, args):
        VanillaGoalEnv.__init__(self, args)
        env_id = {'FetchPush-v1': 'push'}
        assert args.env in env_id.keys()
        MODEL_XML_PATH = os.path.abspath('.') + '/envs/assets/fetch/' + \
            env_id[args.env] + '_obstacle.xml'
        if env_id[args.env] in ['push']:
            initial_qpos = {
                'robot0:slide0': 0.405,
                'robot0:slide1': 0.48,
                'robot0:slide2': 0.0,
                'object0:joint': [1.25, 0.53, 0.4, 1., 0., 0., 0.],
            }
        self.env = FetchEnv(MODEL_XML_PATH,
                            has_object=True,
                            block_gripper=False,
                            n_substeps=20,
                            gripper_extra_height=0.2,
                            target_in_the_air=True,
                            target_offset=0.0,
                            obj_range=0.15,
                            target_range=0.15,
                            distance_threshold=0.05,
                            initial_qpos=initial_qpos,
                            reward_type='sparse')
        self.env = TimeLimit(
            self.env,
            max_episode_steps=args.timesteps)  # A default wrapper of gym.

        self.render = self.env.render
        self.get_obs = self.env.env._get_obs
        self.reset_sim = self.env.env._reset_sim

        self.env.reset()
        self.reset()

    def reset(self):
        self.reset_ep()
        self.sim.set_state(self.initial_state)
        if self.has_object:
            object_xpos = self.initial_gripper_xpos[:2].copy()
            random_offset = np.random.uniform(
                0.3, 1.0) * self.obj_range * self.args.init_offset
            object_xpos -= np.array([random_offset, self.obj_range])
            object_qpos = self.sim.data.get_joint_qpos('object0:joint')
            assert object_qpos.shape == (7, )
            object_qpos[:2] = object_xpos
            self.sim.data.set_joint_qpos('object0:joint', object_qpos)
        self.sim.forward()
        self.goal = self.generate_goal()
        self.last_obs = (self.get_obs()).copy()
        return self.get_obs()

    def generate_goal(self):
        return self.env.env._sample_goal()

    # NOTE: this second definition shadows the one above, so the custom goal
    # sampling below is the one actually used.
    def generate_goal(self):
        if self.has_object:
            goal = self.initial_gripper_xpos[:3] + self.target_offset
            goal[0] += np.random.uniform(-self.target_range,
                                         -self.target_range * 0.3)
            goal[1] += self.target_range
            goal[2] = self.height_offset + int(self.target_in_the_air) * 0.45
        else:
            goal = self.initial_gripper_xpos[:3] + np.array([
                np.random.uniform(-self.target_range, self.target_range),
                self.target_range, self.target_range
            ])
        return goal.copy()
def main():
    # train the policy, then do some tests to get a sense of how it performs
    for arg in sys.argv:
        if arg.startswith('--job='):
            i = int(arg.split('--job=')[1]) - 1

    # pull in the encoder params
    p_dir = "./experiments/extra_train_exps/{}".format(i)
    proj = np.load(p_dir + "projectors.npz")
    proj = np.row_stack([v for k, v in proj.items()])
    proj = la.svd(proj, full_matrices=False)[2]
    enc_dim = proj.shape[0]
    weights = np.load(p_dir + "weights.npz")
    biases = np.load(p_dir + "biases.npz")
    weights = [v for k, v in weights.items()]
    biases = [v for k, v in biases.items()]

    saveload_path = "./experiments/extra_train_exps/{}".format(i)

    # train the model
    # try a few restarts, keep the best
    best_avg_perf = -np.inf
    perfs = []
    for j in range(5):
        # set up the environment
        env = TimeLimit(
            RestartablePendulumEnv(enc_dim=enc_dim),
            max_episode_steps=200)  # not sure effect of max_episode_steps
        env = EncoderWrapper(env, mlp_encoder, [weights, biases, proj])
        env = DummyVecEnv([lambda: env])

        pol = LinearPolicy_MLPCritic
        pol_args = dict(
            layers=[64, 64],
            layer_norm=False
        )  # this is the architecture for the critic in ddpg, doesn't specify policy
        model = train_policy_ddpg(env, pol, pol_args, 300000, verbose=0,
                                  actor_lr=.5, critic_lr=.001)

        # clean up
        env.close()
        # model = DDPG.load(saveload_path + "model")

        # now let's test the model
        # specify the test task
        n_test_steps = 100
        # uniform grid over statespace (20 points)
        angs = np.linspace(-np.pi, np.pi, 5)[:-1]
        vels = np.linspace(-1, 1, 5)
        test_states = np.array(list(itertools.product(angs, vels)))
        n_test_states = len(angs) * len(vels)
        performance = np.zeros(n_test_states)

        # restart the env
        env = TimeLimit(RestartablePendulumEnv(), max_episode_steps=200)
        env = EncoderWrapper(env, mlp_encoder, [weights, biases, proj])

        # for each test state, start the env in the state, then run forward
        # and collect rewards
        for k in range(n_test_states):
            obs = env.reset(state=test_states[k])
            rewards = []
            for j in range(n_test_steps):
                action, _states = model.predict(obs)
                obs, reward, dones, info = env.step(action)
                rewards.append(reward)
                # env.render()
            performance[k] = np.array(rewards).mean()

        avg_perf = performance.mean()
        perfs.append(avg_perf)
        print("average performance of this model: {}".format(avg_perf))
        if avg_perf > best_avg_perf:
            best_avg_perf = avg_perf
            # specify the path to save the model
            model.save(saveload_path + "model")
            np.savetxt(saveload_path + "test_performance.txt", performance)

    # clean up and save results
    np.savetxt(saveload_path + "avg_per_runs.txt", np.array(perfs))
    env.close()
    del model