def generate_demos(env, env_name, model, agent, device, save_dir='evals', episodes=100, temperature=1):
    os.makedirs(save_dir, exist_ok=True)
    save_path = save_dir + '/' + model.name + '.log'
    '''
    if os.path.exists(save_path):
        print('evaluation not completed as %s already exists' % save_path)
        return
    '''
    print('')
    print('evaluating {}'.format(model.name))
    model_path = "models/" + env_name + "_25/01050"
    if env_name == "seaquest":
        model_path = "models/" + env_name + "_5/00035"
    agent.load(model_path)

    logs = [[], []]  #steps, return
    with torch.no_grad():
        for i in range(episodes):
            done = False
            r = 0
            ob = preprocess(env.reset(), env_name)
            steps = 0
            acc_reward = 0
            while True:
                a_act = agent.act(ob, r, done)  #demonstrator's action, kept for debugging/comparison
                ob = torch.from_numpy(ob).float().to(device)
                action = model.act(ob, temperature)
                ob, r, done, _ = env.step(action)
                ob = preprocess(ob, env_name)
                acc_reward += r[0]
                steps += 1
                if done:
                    print("steps: {}, return: {}".format(steps, acc_reward))
                    logs[0] += [steps]
                    logs[1] += [acc_reward]
                    break

    print('return stats:')
    print('min: {}'.format(np.min(logs[1])))
    print('mean: {}'.format(np.mean(logs[1])))
    print('max: {}'.format(np.max(logs[1])))
    with open(save_path, 'wb') as f:
        pickle.dump(logs, f)
def generate_fitness(env, env_name, policy, reward_fn, num_episodes, seed, render=False, softmax=True):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    torch.manual_seed(seed)
    np.random.seed(seed)
    tf.set_random_seed(seed)
    env.unwrapped.envs[0].seed(seed)

    learning_returns = []
    true_returns = []
    for i in range(num_episodes):
        done = False
        r = 0
        ob = env.reset()
        ob_processed = preprocess(ob, env_name)
        ob_cuda = torch.from_numpy(np.array(ob_processed)).float().to(device)
        steps = 0
        acc_reward = 0
        true_reward = 0
        while True:
            action = policy.select_action(ob_cuda, softmax=softmax)
            ob, r, done, _ = env.step(action)
            if render:
                env.render()
            ob_processed = preprocess(ob, env_name)
            ob_cuda = torch.from_numpy(np.array(ob_processed)).float().to(device)
            steps += 1
            acc_reward += reward_fn.predict_reward(ob_cuda).item()
            true_reward += r
            if done or steps > 1000:  #TODO: remove this step cap if possible since it will hurt performance
                if render:
                    print("rollout: {}, steps: {}, pred return: {}, actual return {}".format(
                        i, steps, acc_reward, true_reward))
                break
        learning_returns.append(acc_reward)
        true_returns.append(true_reward)

    return np.mean(learning_returns), np.mean(true_returns)
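# Example usage (a sketch, not part of the original file): generate_fitness is the kind
# of scoring call an outer search loop would make to rate a candidate policy under the
# learned reward while also tracking ground-truth performance. The wrapper below only
# forwards arguments; `candidate_policy` and `learned_reward` are whatever policy and
# reward objects the caller already has.
def fitness_gap(env, env_name, candidate_policy, learned_reward, seed, episodes=5):
    #returns (predicted fitness, ground-truth fitness) so the caller can monitor how
    #well the learned reward tracks the true objective
    pred, true = generate_fitness(env, env_name, candidate_policy, learned_reward,
                                  num_episodes=episodes, seed=seed, render=False, softmax=True)
    return pred, true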
def generate_mean_map_noop_demos(env):
    #add no-op demos
    #note: relies on a module-level env_name (like args elsewhere in this file)
    done = False
    traj = []
    gt_rewards = []
    r = 0

    ob = env.reset()
    steps = 0
    acc_reward = 0
    while steps < 7000:
        action = 0  #no-op instead of agent.act(ob, r, done)
        ob, r, done, _ = env.step(action)
        ob_processed = preprocess(ob, env_name)
        traj.append(ob_processed)

        gt_rewards.append(r[0])
        steps += 1
        acc_reward += r[0]
        if done:
            print("checkpoint: {}, steps: {}, return: {}".format("noop", steps, acc_reward))
            break
    print("noop traj length", len(traj))
    return traj, acc_reward, gt_rewards
def step_wait(self):
    obs, rews, news, infos = self.venv.step_wait()
    #obs shape: [num_env, 84, 84, 4] in the case of Atari games

    #mask the score region and normalize for input to the reward network
    normed_obs = preprocess(obs, self.env_name)

    with torch.no_grad():
        rews_network = self.reward_net.forward(
            torch.from_numpy(np.array(normed_obs)).float().to(
                self.device)).cpu().numpy().squeeze()

    return obs, rews_network, news, infos
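# A minimal sketch of the vectorized-environment wrapper this step_wait belongs to,
# assuming the OpenAI baselines VecEnvWrapper interface and the preprocess helper used
# throughout this file. The class name and constructor are illustrative, not the
# repository's exact class; only the attribute names (env_name, reward_net, device)
# are taken from the method above.
from baselines.common.vec_env import VecEnvWrapper


class LearnedRewardVecEnv(VecEnvWrapper):
    """Replaces the environment's reward with the learned reward network's prediction."""

    def __init__(self, venv, env_name, reward_net, device):
        super().__init__(venv)
        self.env_name = env_name
        self.reward_net = reward_net.to(device)
        self.device = device

    def reset(self, **kwargs):
        return self.venv.reset(**kwargs)

    def step_wait(self):
        #same logic as the step_wait above: observations pass through unchanged,
        #rewards are replaced by the network's predictions
        obs, rews, news, infos = self.venv.step_wait()
        normed_obs = preprocess(obs, self.env_name)
        with torch.no_grad():
            rews_network = self.reward_net.forward(
                torch.from_numpy(np.array(normed_obs)).float().to(
                    self.device)).cpu().numpy().squeeze()
        return obs, rews_network, news, infos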
def get_demo_feature_counts(env_name, trajectory, feature_net, max_length):
    #note: relies on a module-level device, like the other feature-count helpers in this file
    fcount_rollouts = []  #keep track of the feature counts for each rollout
    f_counts = np.zeros(feature_net.fc2.in_features)
    steps = 0
    for i in range(min(max_length, len(trajectory))):
        ob = trajectory[i]
        steps += 1
        ob_processed = preprocess(ob, env_name)
        phi_s = feature_net.state_feature(
            torch.from_numpy(ob_processed).float().to(
                device)).cpu().squeeze().numpy()
        f_counts += phi_s
    ave_fcounts = f_counts  #single demo, so the summed counts are the "average"
    fcount_rollouts.append(ave_fcounts)
    return ave_fcounts, fcount_rollouts, [steps]
def generate_dropout_distribution_noop(env, env_name, agent, dropout_net, num_dropout_samples, device):
    dropout_returns = np.zeros(num_dropout_samples)
    true_returns = []
    episode_count = 1
    for i in range(episode_count):
        done = False
        r = 0

        ob = env.reset()
        steps = 0
        acc_reward = 0
        while steps < 20000:
            action = 0  #no-op action
            ob, r, done, _ = env.step(action)
            ob_processed = preprocess(ob, env_name)
            ob_processed = torch.from_numpy(ob_processed).float().to(device)
            for d in range(num_dropout_samples):
                dropout_returns[d] += dropout_net.cum_return(ob_processed)[0].item()
            steps += 1
            if steps % 1000 == 0:
                print(steps)
            acc_reward += r[0]
            if done:
                print("noop, episode: {}, steps: {}, return: {}".format(i, steps, acc_reward))
                break
        true_returns.append(acc_reward)
    return dropout_returns, true_returns
def eval(self, env, agent):
    episode_count = self.num_eval_episodes  #100 episodes by default
    reward = 0
    done = False
    rewards = []
    for i in range(int(episode_count)):
        ob = env.reset()
        steps = 0
        acc_reward = 0
        while True:
            #preprocess the state
            state = preprocess(ob, self.env_name)
            state = np.transpose(state, (0, 3, 1, 2))
            if np.random.rand() < self.epsilon_greedy:
                action = env.action_space.sample()
            else:
                action = agent.get_action(state)
            ob, reward, done, _ = env.step(action)
            steps += 1
            acc_reward += reward
            if done:
                print("Episode: {}, Steps: {}, Reward: {}".format(i, steps, acc_reward))
                rewards.append(acc_reward)
                break
    print("Mean reward is: " + str(np.mean(rewards)))
def generate_ensemble_distribution(env, env_name, agent, model_dir, checkpoint, ensemble, num_rollouts, device):
    ensemble_returns = []
    true_returns = []

    model_path = model_dir + "/models/" + env_name + "_25/" + checkpoint
    #if env_name == "seaquest":
    #    model_path = model_dir + "/models/" + env_name + "_5/" + checkpoint
    agent.load(model_path)
    episode_count = num_rollouts
    for i in range(episode_count):
        done = False
        traj = []
        r = 0

        ob = env.reset()
        steps = 0
        acc_reward = 0
        while True:
            action = agent.act(ob, r, done)
            ob, r, done, _ = env.step(action)
            ob_processed = preprocess(ob, env_name)
            ob_processed = ob_processed[0]  #get rid of first dimension ob.shape = (1,84,84,4)
            traj.append(ob_processed)

            steps += 1
            acc_reward += r[0]
            if done:
                print("checkpoint: {}, episode: {}, steps: {}, return: {}".format(
                    checkpoint, i, steps, acc_reward))
                break

        #now run the trajectory through each network in the ensemble
        traj_i = torch.from_numpy(np.array(traj)).float().to(device)
        for ensemble_net in ensemble:
            cum_ret = ensemble_net.cum_return(traj_i)[0].item()
            ensemble_returns.append(cum_ret)

        true_returns.append(acc_reward)
    return ensemble_returns, true_returns
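# Example (a sketch, not part of the original file): summarizing the distribution
# returned above. ensemble_returns holds one predicted return per (rollout, ensemble
# member) pair appended in order, so reshaping by the ensemble size recovers a
# per-rollout distribution; a low percentile gives a pessimistic return estimate.
def summarize_ensemble_returns(ensemble_returns, true_returns, num_ensemble):
    preds = np.array(ensemble_returns).reshape(-1, num_ensemble)  #[rollouts, ensemble members]
    print("mean predicted return per rollout:", preds.mean(axis=1))
    print("std across ensemble members:", preds.std(axis=1))
    print("5th-percentile (pessimistic) estimate:", np.percentile(preds, 5, axis=1))
    print("ground-truth returns:", true_returns)
    return preds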
def step_wait(self):
    obs, rews, news, infos = self.venv.step_wait()
    normed_obs = preprocess(obs, self.env_name)
    with torch.no_grad():
        rews_network = self.reward_net.forward(
            torch.from_numpy(np.array(normed_obs)).float().to(
                self.device)).cpu().numpy().squeeze()
    return obs, rews_network, news, infos
def generate_dropout_distribution_checkpoint(env, env_name, agent, checkpoint_model_dir, dropout_net, num_rollouts, num_dropout_samples, device, time_limit=100000):
    dropout_returns = []
    true_returns = []

    model_path = checkpoint_model_dir
    agent.load(model_path)
    episode_count = num_rollouts
    for i in range(episode_count):
        dropout_rets = np.zeros(num_dropout_samples)
        done = False
        r = 0

        ob = env.reset()
        steps = 0
        acc_reward = 0
        while steps < time_limit:
            action = agent.act(ob, r, done)
            ob, r, done, _ = env.step(action)
            ob_processed = preprocess(ob, env_name)
            ob_processed = torch.from_numpy(ob_processed).float().to(device)
            for d in range(num_dropout_samples):
                dropout_rets[d] += dropout_net.cum_return(ob_processed)[0].item()
            del ob_processed
            steps += 1
            acc_reward += r[0]
            if done:
                print("checkpoint: {}, episode: {}, steps: {}, return: {}".format(
                    model_path, i, steps, acc_reward))
                break
        if steps >= time_limit:
            print("checkpoint: {}, episode: {}, steps: {}, return: {}".format(
                model_path, i, steps, acc_reward))
        true_returns.append(acc_reward)
        dropout_returns.extend(dropout_rets)
    return dropout_returns, true_returns
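# For the Monte-Carlo dropout samples above to differ from one another, the dropout
# layers must stay stochastic at evaluation time. A minimal sketch of one common way
# to do that, assuming the network's dropout is implemented with torch.nn.Dropout
# modules (an assumption about dropout_net, not something stated in this file):
def enable_mc_dropout(dropout_net):
    dropout_net.eval()  #keep batch norm etc. in eval mode
    for module in dropout_net.modules():
        if isinstance(module, torch.nn.Dropout):
            module.train()  #keep sampling fresh dropout masks at inference time
    return dropout_net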
def generate_demos(self, env, agent, epsilon_greedy):
    print("Generating demos for epsilon=", epsilon_greedy)
    episode_count = self.num_eval_episodes
    reward = 0
    done = False
    rewards = []
    cum_steps = []
    demos = []
    for i in range(int(episode_count)):
        ob = env.reset()
        steps = 0
        acc_reward = 0
        traj = []
        while True:
            #preprocess the state
            state = preprocess(ob, self.env_name)
            traj.append(state)
            state = np.transpose(state, (0, 3, 1, 2))
            if np.random.rand() < epsilon_greedy:
                action = env.action_space.sample()
            else:
                action = agent.get_action(state)
            ob, reward, done, _ = env.step(action)
            steps += 1
            acc_reward += reward
            if done:
                print("Episode: {}, Steps: {}, Reward: {}".format(i, steps, acc_reward))
                rewards.append(acc_reward)
                cum_steps.append(steps)
                break
        print("traj length", len(traj))
        demos.append(traj)
        print("demo len", len(demos))
    print("Mean reward is: " + str(np.mean(rewards)))
    print("Mean step length is: " + str(np.mean(cum_steps)))
    return demos, rewards
def generate_expert_demos(env, env_name, agent, epsilon_greedy):
    demonstrations = []
    learning_returns = []
    learning_rewards = []

    model_path = "path_to_model"  #placeholder for the pretrained expert checkpoint
    agent.load(model_path)
    episode_count = 25
    for i in range(episode_count):
        done = False
        traj = []
        gt_rewards = []
        r = 0

        ob = env.reset()
        steps = 0
        acc_reward = 0
        while True:
            if np.random.rand() < epsilon_greedy:
                action = [env.action_space.sample()]
            else:
                action = agent.act(ob, r, done)
            ob_processed = preprocess(ob, env_name)
            traj.append((ob_processed, action))
            ob, r, done, _ = env.step(action)

            gt_rewards.append(r[0])
            steps += 1
            acc_reward += r[0]
            if done or steps > 4000:
                print("steps: {}, return: {}".format(steps, acc_reward))
                break
        #only keep high-scoring demonstrations
        if acc_reward > 300:
            print("traj length", len(traj))
            demonstrations.append(traj)
            print("demo length", len(demonstrations))
            learning_returns.append(acc_reward)
            learning_rewards.append(gt_rewards)

    print(np.mean(learning_returns), np.max(learning_returns))
    return demonstrations, learning_returns, learning_rewards
def generate_ensemble_distribution_checkpoint(env, env_name, agent, checkpoint_model_dir, ensemble, num_rollouts, device):
    ensemble_returns = []
    true_returns = []

    model_path = checkpoint_model_dir
    agent.load(model_path)
    episode_count = num_rollouts
    for i in range(episode_count):
        ensemble_rets = np.zeros(len(ensemble))
        done = False
        r = 0

        ob = env.reset()
        steps = 0
        acc_reward = 0
        while True:
            action = agent.act(ob, r, done)
            ob, r, done, _ = env.step(action)
            ob_processed = preprocess(ob, env_name)
            ob_processed = torch.from_numpy(ob_processed).float().to(device)
            for idx, net in enumerate(ensemble):
                ensemble_rets[idx] += net.cum_return(ob_processed)[0].item()
            del ob_processed
            steps += 1
            acc_reward += r[0]
            if done:
                print("checkpoint: {}, episode: {}, steps: {}, return: {}".format(
                    model_path, i, steps, acc_reward))
                break
        true_returns.append(acc_reward)
        ensemble_returns.extend(ensemble_rets)
    return ensemble_returns, true_returns
def generate_demos(env, env_name, agent, checkpoint_path, num_demos):
    print("generating demos from checkpoint:", checkpoint_path)
    demonstrations = []
    learning_returns = []

    model_path = checkpoint_path
    agent.load(model_path)
    episode_count = num_demos
    for i in range(episode_count):
        done = False
        traj = []
        gt_rewards = []
        r = 0

        ob = env.reset()
        steps = 0
        acc_reward = 0
        while True:
            action = agent.act(ob, r, done)
            ob_processed = preprocess(ob, env_name)
            #ob_processed = ob_processed[0] #get rid of spurious first dimension ob.shape = (1,84,84,4)
            traj.append((ob_processed, action))
            ob, r, done, _ = env.step(action)

            gt_rewards.append(r[0])
            steps += 1
            acc_reward += r[0]
            if done:
                print("demo: {}, steps: {}, return: {}".format(i, steps, acc_reward))
                break
        print("traj length", len(traj))
        print("demo length", len(demonstrations))
        demonstrations.append(traj)
        learning_returns.append(acc_reward)

    print("Mean", np.mean(learning_returns), "Max", np.max(learning_returns))
    return demonstrations, learning_returns
def generate_dropout_distribution_framestack(env, env_name, framestack_path, dropout_net, num_dropout_samples, device, time_limit=100000):
    #uses a prerecorded framestack to do return-uncertainty analysis
    dropout_rets = np.zeros(num_dropout_samples)
    true_returns = [-1]  #TODO: no way to get true returns yet; need to grab them from Prabhat's code (should be recoverable from the saved rewards)

    #load the framestack
    trajectory = np.load(framestack_path)
    for i in range(min(time_limit, len(trajectory))):
        ob = trajectory[i]
        ob_processed = preprocess(ob, env_name)
        ob_processed = torch.from_numpy(ob_processed).float().to(device)
        for d in range(num_dropout_samples):
            dropout_rets[d] += dropout_net.cum_return(ob_processed)[0].item()
        del ob_processed
    #true_returns.append(acc_reward) #TODO
    return dropout_rets, true_returns
def generate_noop_demo(self, env):
    print("Generating demos for noop agent")
    noop_action = 0
    episode_count = 4
    reward = 0
    done = False
    rewards = []
    cum_steps = []
    demos = []
    for i in range(int(episode_count)):
        ob = env.reset()
        steps = 0
        acc_reward = 0
        traj = []
        while True:
            #preprocess the state
            state = preprocess(ob, self.env_name)
            traj.append(state)
            state = np.transpose(state, (0, 3, 1, 2))
            ob, reward, done, _ = env.step(noop_action)
            steps += 1
            acc_reward += reward
            if done or steps > 500:
                print("Episode: {}, Steps: {}, Reward: {}".format(i, steps, acc_reward))
                rewards.append(acc_reward)
                cum_steps.append(steps)
                break
        demos.append(traj)
    print("Mean reward is: " + str(np.mean(rewards)))
    print("Mean step length is: " + str(np.mean(cum_steps)))
    return demos
def get_preprocessed_trajectories(env_name, dataset, data_dir, preprocess_name):
    """Returns an array of trajectories corresponding to what you would get running checkpoints from PPO.

    Demonstrations are grayscaled, maxpooled stacks of 4 frames with values normalized between 0 and 1,
    and the top (score) section of the screen is masked.
    """
    print("generating human demos for", env_name)
    demos = get_sorted_traj_indices(env_name, dataset)
    human_scores = []
    human_demos = []
    for indx, score in demos:
        human_scores.append(score)
        traj_dir = path.join(data_dir, 'screens', env_name, str(indx))
        maxed_traj = MaxSkipAndWarpFrames(traj_dir)
        stacked_traj = StackFrames(maxed_traj)

        demo_norm_mask = []
        #normalize values to be between 0 and 1 and mask the top (score) portion of the frame
        for ob in stacked_traj:
            demo_norm_mask.append(preprocess(ob, preprocess_name)[0])
        human_demos.append(demo_norm_mask)

    return human_demos, human_scores
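# Example (a sketch, not part of the original file): the human demos come back paired
# with their scores, and preference-based reward learning typically wants them ordered
# worst to best. This helper only re-sorts the two return values of the function above.
def rank_human_demos(human_demos, human_scores):
    order = np.argsort(human_scores)  #indices that sort scores ascending
    ranked_demos = [human_demos[i] for i in order]
    ranked_scores = [human_scores[i] for i in order]
    return ranked_demos, ranked_scores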
#evaluation loop: compare the learned reward net's per-step predictions to the true environment reward
reward_net.to(device)
for i in range(int(episode_count)):
    ob = env.reset()
    steps = 0
    acc_reward = 0.
    pred_acc_reward = 0.
    while True:
        if np.random.rand() < 0.01:
            action = env.action_space.sample()
        else:
            action = agent.act(ob, reward, done)
        ob, reward, done, _ = env.step(action)

        #predict the learned reward for this observation
        input_ob = preprocess(ob, env_name)
        input_ob = torch.from_numpy(input_ob).float().to(device)
        with torch.no_grad():
            rpred = reward_net(input_ob).item()
        if args.render:
            print("Pred {} vs. True {}".format(rpred, reward))
            if abs(rpred) > 0.5:
                input()
            env.render()
        pred_acc_reward += rpred
        steps += 1
        acc_reward += reward
        if done:
            print(steps, acc_reward, pred_acc_reward)
            break
def generate_novice_demos(env, env_name, agent, model_dir, debug):
    if debug:
        checkpoint_min = 300
        checkpoint_max = 400
        checkpoint_step = 50
    else:
        checkpoint_min = 50
        checkpoint_max = 600
        checkpoint_step = 50
    checkpoints = []
    if env_name == "enduro":
        checkpoint_min = 3100
        checkpoint_max = 3650
    elif env_name == "seaquest":
        checkpoint_min = 10
        checkpoint_max = 65
        checkpoint_step = 5
    for i in range(checkpoint_min, checkpoint_max + checkpoint_step, checkpoint_step):
        if i < 10:
            checkpoints.append('0000' + str(i))
        elif i < 100:
            checkpoints.append('000' + str(i))
        elif i < 1000:
            checkpoints.append('00' + str(i))
        elif i < 10000:
            checkpoints.append('0' + str(i))
    print(checkpoints)

    demonstrations = []
    learning_returns = []
    learning_rewards = []
    for checkpoint in checkpoints:
        model_path = model_dir + env_name + "_25/" + checkpoint
        if env_name == "seaquest":
            model_path = model_dir + env_name + "_5/" + checkpoint
        agent.load(model_path)
        episode_count = 1
        for i in range(episode_count):
            done = False
            traj = []
            gt_rewards = []
            r = 0

            ob = env.reset()
            steps = 0
            acc_reward = 0
            while True:
                action = agent.act(ob, r, done)
                ob, r, done, _ = env.step(action)
                ob_processed = preprocess(ob, env_name)
                traj.append(ob_processed)

                gt_rewards.append(r[0])
                steps += 1
                acc_reward += np.sign(r[0])  #accumulate the clipped (sign) reward
                if done:
                    print("checkpoint: {}, steps: {}, clipped return: {}, true return: {}".format(
                        checkpoint, steps, acc_reward, np.sum(gt_rewards)))
                    break
            print("traj length", len(traj))
            print("demo length", len(demonstrations))
            demonstrations.append(traj)
            learning_returns.append(acc_reward)
            learning_rewards.append(gt_rewards)

    return demonstrations, learning_returns, learning_rewards
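# Example (a sketch, not part of the original file): because checkpoints are loaded in
# training order, the demonstrations above tend to improve down the list. One common
# way to consume them is to form ranked snippet pairs for preference-based reward
# learning, labeling the snippet from the higher-return demo as preferred. The pairing
# rule and snippet length here are illustrative choices, not the repository's exact scheme.
def make_preference_pairs(demonstrations, learning_returns, num_pairs, snippet_len=50):
    pairs, labels = [], []
    for _ in range(num_pairs):
        i, j = np.random.choice(len(demonstrations), size=2, replace=False)
        ti = np.random.randint(0, max(1, len(demonstrations[i]) - snippet_len))
        tj = np.random.randint(0, max(1, len(demonstrations[j]) - snippet_len))
        snip_i = demonstrations[i][ti:ti + snippet_len]
        snip_j = demonstrations[j][tj:tj + snippet_len]
        pairs.append((snip_i, snip_j))
        labels.append(0 if learning_returns[i] > learning_returns[j] else 1)  #0 means the first snippet is preferred
    return pairs, labels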
def get_policy_feature_counts(env_name, checkpointpath, num_rollouts, fixed_horizon):
    if env_name == "spaceinvaders":
        env_id = "SpaceInvaders"
    elif env_name == "mspacman":
        env_id = "MsPacman"
    elif env_name == "videopinball":
        env_id = "VideoPinball"
    elif env_name == "beamrider":
        env_id = "BeamRider"
    elif env_name == "montezumarevenge":
        env_id = "MontezumaRevenge"
    else:
        env_id = env_name[0].upper() + env_name[1:]
    if fixed_horizon:
        env_id += "NoFrameskipFixedHorizon-v0"
    else:
        env_id += "NoFrameskip-v4"

    env_type = "atari"
    stochastic = True

    #env id, env type, num envs, and seed
    env = make_vec_env(env_id, 'atari', 1, 0,
                       wrapper_kwargs={
                           'clip_rewards': False,
                           'episode_life': False,
                       })
    env = VecFrameStack(env, 4)
    agent = PPO2Agent(env, env_type, stochastic)  #defaults to stochastic=False (deterministic policy)

    learning_returns = []
    print(checkpointpath)
    agent.load(checkpointpath)
    episode_count = num_rollouts
    f_counts = np.zeros(3)  #hard-coded negative, zero, positive (clipped reward) features
    for i in range(episode_count):
        done = False
        r = 0

        ob = env.reset()
        steps = 0
        acc_reward = 0
        while True:
            action = 0  #no-op instead of agent.act(ob, r, done)
            ob, r, done, _ = env.step(action)
            ob_processed = preprocess(ob, env_name)
            if np.sign(r[0]) == -1:
                phi_s = np.array([1.0, 0.0, 0.0])
            elif np.sign(r[0]) == 0:
                phi_s = np.array([0.0, 1.0, 0.0])
            else:
                phi_s = np.array([0.0, 0.0, 1.0])
            f_counts += phi_s
            steps += 1
            acc_reward += r[0]
            if done:
                print("steps: {}, return: {}".format(steps, acc_reward))
                break
        learning_returns.append(acc_reward)

    env.close()
    #tf.reset_default_graph()
    ave_fcounts = f_counts / episode_count
    return learning_returns, ave_fcounts
demonstrator.load(model_path)
for i in range(episode_count):
    done = False
    traj = []
    r = 0

    ob = env.reset()
    steps = 0
    acc_reward = 0
    while True:
        action = demonstrator.act(ob, r, done)
        ob, r, done, _ = env.step(action)
        ob_processed = preprocess(ob, env_name)
        ob_processed = ob_processed[0]  #get rid of spurious first dimension ob.shape = (1,84,84,4)
        traj.append(ob_processed)

        steps += 1
        acc_reward += r[0]
        if done:
            print("checkpoint: {}, steps: {}, return: {}".format(checkpoint, steps, acc_reward))
            break
    print("traj length", len(traj))
    #demonstrations.append(traj)
    learning_returns_extrapolate.append(acc_reward)
    pred_returns_extrapolate.append(
        reward_net.cum_return(torch.from_numpy(np.array(traj)).float().to(device))[0].item())
    print("pred return", pred_returns_extrapolate[-1])
def generate_demos(env, env_name, agent, model_dir, checkpoint_range, save_dir='demos', episodes_per_checkpoint=5, map_increment=1e9):
    save_path = save_dir + '/' + env_name + '.lmdb'
    if os.path.exists(save_path):
        print('Demonstrations not collected as %s already exists' % save_path)
        return

    checkpoints = []
    for i in checkpoint_range:
        if i < 10:
            checkpoints.append('0000' + str(i))
        elif i < 100:
            checkpoints.append('000' + str(i))
        elif i < 1000:
            checkpoints.append('00' + str(i))
        elif i < 10000:
            checkpoints.append('0' + str(i))
    print(checkpoints)

    makedirs(save_dir, exist_ok=True)
    map_counter = 1
    keys = []
    with lmdb.open(save_path, map_size=int(map_counter * map_increment)) as lmdb_env:
        for checkpoint in checkpoints:
            model_path = model_dir + "/models/" + env_name + "_25/" + checkpoint
            if env_name == "seaquest":
                model_path = model_dir + "/models/" + env_name + "_5/" + checkpoint
            agent.load(model_path)

            for i in range(episodes_per_checkpoint):
                done = False
                traj = []
                gt_rewards = []
                actions = []
                r = 0
                ob = env.reset()
                steps = 0
                acc_reward = 0
                while True:
                    action = agent.act(ob, r, done)
                    ob, r, done, _ = env.step(action)
                    ob_processed = preprocess(ob, env_name)
                    traj.append(ob_processed)
                    actions.append(action)
                    gt_rewards.append(r[0])
                    acc_reward += r[0]
                    steps += 1
                    if done:
                        print("checkpoint: {}, steps: {}, return: {}".format(checkpoint, steps, acc_reward))
                        break

                #store observations as uint8 to save space
                traj = (np.concatenate(traj, axis=0) * 255).astype(np.uint8)
                actions = np.array(actions)
                gt_rewards = np.array(gt_rewards)
                value = {'states': traj, 'actions': actions, 'rewards': gt_rewards,
                         'length': steps, 'return': acc_reward}
                key = '%s_%s_%d' % (env_name, checkpoint, i)
                lmdb_env, key = lmdb_submit(key, value, lmdb_env, save_path, map_counter, map_increment)
                keys += [key]

        with lmdb_env.begin(write=True) as txn:
            txn.put(b'__keys__', pickle.dumps(keys))

    print('%d total demonstrations gathered' % len(keys))
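# Example (a sketch, not part of the original file): reading the demonstrations back.
# This assumes lmdb_submit, defined elsewhere in the repo, pickles each value dict under
# the key it returns, which is consistent with how '__keys__' is written above; if its
# storage format differs, the per-key load below would need to match it.
def load_demos(save_path):
    demos = []
    with lmdb.open(save_path, readonly=True, lock=False) as lmdb_env:
        with lmdb_env.begin() as txn:
            keys = pickle.loads(txn.get(b'__keys__'))
            for key in keys:
                raw_key = key.encode() if isinstance(key, str) else key
                demos.append(pickle.loads(txn.get(raw_key)))
    return demos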
def generate_mean_map_noop_demos(env, env_name, agent, mean_path, map_path):
    demonstrations = []
    learning_returns = []
    learning_rewards = []
    #note: only the MAP checkpoint is rolled out here; mean_path is currently unused
    for model_path in [map_path]:
        agent.load(model_path)
        episode_count = 1
        for i in range(episode_count):
            done = False
            traj = []
            gt_rewards = []
            r = 0

            ob = env.reset()
            steps = 0
            acc_reward = 0
            while steps < 7000:
                action = agent.act(ob, r, done)
                ob, r, done, _ = env.step(action)
                if args.render:
                    env.render()
                ob_processed = preprocess(ob, env_name)
                traj.append(ob_processed)

                gt_rewards.append(r[0])
                steps += 1
                acc_reward += r[0]
                if done:
                    break
            print("checkpoint: {}, steps: {}, return: {}".format(model_path, steps, acc_reward))

            print("traj length", len(traj))
            print("demo length", len(demonstrations))
            demonstrations.append(traj)
            learning_returns.append(acc_reward)
            learning_rewards.append(gt_rewards)

    #add no-op demos
    done = False
    traj = []
    gt_rewards = []
    r = 0

    ob = env.reset()
    steps = 0
    acc_reward = 0
    while steps < 7000:
        action = 0  #no-op instead of agent.act(ob, r, done)
        ob, r, done, _ = env.step(action)
        ob_processed = preprocess(ob, env_name)
        traj.append(ob_processed)

        gt_rewards.append(r[0])
        steps += 1
        acc_reward += r[0]
        if done:
            print("checkpoint: {}, steps: {}, return: {}".format("noop", steps, acc_reward))
            break
    print("noop traj length", len(traj))
    print("demo length", len(demonstrations))
    demonstrations.append(traj)
    learning_returns.append(acc_reward)
    learning_rewards.append(gt_rewards)

    return demonstrations, learning_returns, learning_rewards
def get_policy_feature_counts(env_name, checkpointpath, num_rollouts, max_length=3000):
    if env_name == "spaceinvaders":
        env_id = "SpaceInvadersNoFrameskip-v4"
    elif env_name == "mspacman":
        env_id = "MsPacmanNoFrameskip-v4"
    elif env_name == "videopinball":
        env_id = "VideoPinballNoFrameskip-v4"
    elif env_name == "beamrider":
        env_id = "BeamRiderNoFrameskip-v4"
    elif env_name == "montezumarevenge":
        env_id = "MontezumaRevengeNoFrameskip-v4"
    else:
        env_id = env_name[0].upper() + env_name[1:] + "NoFrameskip-v4"

    env_type = "atari"
    stochastic = True

    #env id, env type, num envs, and seed
    env = make_vec_env(env_id, 'atari', 1, 0,
                       wrapper_kwargs={
                           'clip_rewards': False,
                           'episode_life': False,
                       })
    env = VecFrameStack(env, 4)
    agent = PPO2Agent(env, env_type, stochastic)  #defaults to stochastic=False (deterministic policy)

    learning_returns = []
    print(checkpointpath)
    agent.load(checkpointpath)
    episode_count = num_rollouts
    if args.no_term:
        f_counts = np.zeros(3)  #negative, zero, positive clipped rewards
    else:
        f_counts = np.zeros(4)  #negative, zero, positive clipped rewards, plus a terminal-padding feature
    for i in range(episode_count):
        print("episode", i)
        done = False
        r = 0

        ob = env.reset()
        steps = 0
        acc_reward = 0
        while steps < max_length:
            if not done:
                action = agent.act(ob, r, done)
                ob, r, done, _ = env.step(action)
                ob_processed = preprocess(ob, env_name)
                if np.sign(r[0]) == -1:
                    phi_s = np.array([1.0, 0.0, 0.0]) if args.no_term else np.array([1.0, 0.0, 0.0, 0.0])
                elif np.sign(r[0]) == 0:
                    phi_s = np.array([0.0, 1.0, 0.0]) if args.no_term else np.array([0.0, 1.0, 0.0, 0.0])
                elif np.sign(r[0]) == 1:
                    phi_s = np.array([0.0, 0.0, 1.0]) if args.no_term else np.array([0.0, 0.0, 1.0, 0.0])
                else:
                    print("error: not a valid clipped reward")
                    sys.exit()
                f_counts += phi_s
                steps += 1
                acc_reward += r[0]
            else:
                #episode ended early: add the appropriate padding feature counts up to max_length, then stop
                if args.no_term:
                    phi_s = (max_length - steps) * np.array([0.0, 1.0, 0.0])
                else:
                    phi_s = (max_length - steps) * np.array([0.0, 0.0, 0.0, 1.0])
                f_counts += phi_s
                break
        print("steps: {}, return: {}".format(steps, acc_reward))
        learning_returns.append(acc_reward)

    env.close()
    #tf.reset_default_graph()
    del agent
    del env

    ave_fcounts = f_counts / episode_count
    return learning_returns, ave_fcounts
def generate_novice_demos(env, env_name, agent, model_dir):
    checkpoint_min = 550
    checkpoint_max = 600
    checkpoint_step = 50
    checkpoints = []
    if env_name == "enduro":
        checkpoint_min = 3100
        checkpoint_max = 3650
    elif env_name == "seaquest":
        checkpoint_min = 10
        checkpoint_max = 65
        checkpoint_step = 5
    for i in range(checkpoint_min, checkpoint_max + checkpoint_step, checkpoint_step):
        if i < 10:
            checkpoints.append('0000' + str(i))
        elif i < 100:
            checkpoints.append('000' + str(i))
        elif i < 1000:
            checkpoints.append('00' + str(i))
        elif i < 10000:
            checkpoints.append('0' + str(i))
    #if env_name == "pong":
    #    checkpoints = ['00025','00050','00175','00200','00250','00350','00450','00500','00550','00600','00700','00700']
    print(checkpoints)

    demonstrations = []
    learning_returns = []
    learning_rewards = []
    for checkpoint in checkpoints:
        model_path = model_dir + "/models/" + env_name + "_25/" + checkpoint
        if env_name == "seaquest":
            model_path = model_dir + "/models/" + env_name + "_5/" + checkpoint
        agent.load(model_path)
        episode_count = 1
        for i in range(episode_count):
            done = False
            traj = []
            gt_rewards = []
            r = 0

            ob = env.reset()
            steps = 0
            acc_reward = 0
            while True:
                action = agent.act(ob, r, done)
                ob_processed = preprocess(ob, env_name)
                #ob_processed = ob_processed[0] #get rid of spurious first dimension ob.shape = (1,84,84,4)
                traj.append((ob_processed, action))
                ob, r, done, _ = env.step(action)

                gt_rewards.append(r[0])
                steps += 1
                acc_reward += r[0]
                if done:
                    print("checkpoint: {}, steps: {}, return: {}".format(checkpoint, steps, acc_reward))
                    break
            print("traj length", len(traj))
            print("demo length", len(demonstrations))
            demonstrations.append(traj)
            learning_returns.append(acc_reward)
            learning_rewards.append(gt_rewards)

    print(np.mean(learning_returns), np.max(learning_returns))
    return demonstrations, learning_returns, learning_rewards
def get_policy_feature_counts(env_name, checkpointpath, feature_net, num_rollouts, max_length, no_op=False):
    if env_name == "spaceinvaders":
        env_id = "SpaceInvadersNoFrameskip-v4"
    elif env_name == "mspacman":
        env_id = "MsPacmanNoFrameskip-v4"
    elif env_name == "videopinball":
        env_id = "VideoPinballNoFrameskip-v4"
    elif env_name == "beamrider":
        env_id = "BeamRiderNoFrameskip-v4"
    elif env_name == "montezumarevenge":
        env_id = "MontezumaRevengeNoFrameskip-v4"
    else:
        env_id = env_name[0].upper() + env_name[1:] + "NoFrameskip-v4"

    env_type = "atari"
    stochastic = True

    #env id, env type, num envs, and seed
    env = make_vec_env(env_id, 'atari', 1, 0,
                       wrapper_kwargs={
                           'clip_rewards': False,
                           'episode_life': False,
                       })
    env = VecFrameStack(env, 4)
    agent = PPO2Agent(env, env_type, stochastic)  #defaults to stochastic=False (deterministic policy)

    learning_returns = []
    fcount_rollouts = []  #keep track of the feature counts for each rollout
    num_steps = []
    print("using checkpoint", checkpointpath, "(if None, then using the no-op policy)")
    if not no_op:
        agent.load(checkpointpath)
    episode_count = num_rollouts
    f_counts = np.zeros(feature_net.fc2.in_features)
    for i in range(episode_count):
        done = False
        fc_rollout = np.zeros(feature_net.fc2.in_features)
        r = 0

        ob = env.reset()
        steps = 0
        acc_reward = 0
        while steps < max_length:
            if no_op:
                action = 0
            else:
                action = agent.act(ob, r, done)
            ob, r, done, _ = env.step(action)
            env.render()
            ob_processed = preprocess(ob, env_name)
            phi_s = feature_net.state_feature(
                torch.from_numpy(ob_processed).float().to(
                    device)).cpu().squeeze().numpy()
            fc_rollout += phi_s
            f_counts += phi_s
            steps += 1
            acc_reward += r[0]
            if done:
                print("didn't run long enough!")
                break
        print("steps: {}, return: {}".format(steps, acc_reward))
        fcount_rollouts.append(fc_rollout)
        learning_returns.append(acc_reward)
        num_steps.append(steps)

    env.close()
    #tf.reset_default_graph()
    ave_fcounts = f_counts / episode_count
    return learning_returns, ave_fcounts, fcount_rollouts, num_steps
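# Example (a sketch, not part of the original file): with a linear reward head,
# r(s) = w . phi(s), a policy's expected return is just w dotted with its average
# feature counts. Given a matrix of sampled weight vectors W (one row per sample,
# e.g. drawn over the reward network's last layer), this turns ave_fcounts from the
# function above into a distribution over the policy's predicted return.
def expected_returns_from_fcounts(ave_fcounts, W):
    W = np.atleast_2d(W)          #[num_weight_samples, num_features]
    return W.dot(ave_fcounts)     #[num_weight_samples] predicted returns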
episode_count = 1
for i in range(episode_count):
    done = False
    traj = []
    r = 0

    ob = env.reset()
    steps = 0
    acc_reward = 0
    while True:
        action = agent.act(ob, r, done)
        ob, r, done, _ = env.step(action)
        traj.append(preprocess(ob, env_name))
        steps += 1
        acc_reward += r[0]
        if done:
            print("checkpoint: {}, steps: {}, return: {}".format(checkpoint, steps, acc_reward))
            break
    print("traj length", len(traj))
    print("demo length", len(demonstrations))
    demonstrations.append(traj)
    learning_returns_demos.append(acc_reward)
    pred_returns_demos.append(
        reward.cum_return(torch.from_numpy(np.array(traj)).float().to(device))[0].item())
    print("pred return", pred_returns_demos[-1])
def get_policy_feature_counts(env_name, checkpointpath, feature_net, num_rollouts, add_bias=False):
    if env_name == "spaceinvaders":
        env_id = "SpaceInvadersNoFrameskip-v4"
    elif env_name == "mspacman":
        env_id = "MsPacmanNoFrameskip-v4"
    elif env_name == "videopinball":
        env_id = "VideoPinballNoFrameskip-v4"
    elif env_name == "beamrider":
        env_id = "BeamRiderNoFrameskip-v4"
    elif env_name == "montezumarevenge":
        env_id = "MontezumaRevengeNoFrameskip-v4"
    else:
        env_id = env_name[0].upper() + env_name[1:] + "NoFrameskip-v4"

    env_type = "atari"
    stochastic = True

    #env id, env type, num envs, and seed
    env = make_vec_env(env_id, 'atari', 1, 0,
                       wrapper_kwargs={
                           'clip_rewards': False,
                           'episode_life': False,
                       })
    env = VecFrameStack(env, 4)
    agent = PPO2Agent(env, env_type, stochastic)  #defaults to stochastic=False (deterministic policy)

    learning_returns = []
    print(checkpointpath)
    agent.load(checkpointpath)
    episode_count = num_rollouts
    if add_bias:
        f_counts = np.zeros(feature_net.fc2.in_features + 1)  #extra constant bias feature
    else:
        f_counts = np.zeros(feature_net.fc2.in_features)
    for i in range(episode_count):
        done = False
        r = 0

        ob = env.reset()
        steps = 0
        acc_reward = 0
        while steps < 7000:
            action = agent.act(ob, r, done)
            ob, r, done, _ = env.step(action)
            ob_processed = preprocess(ob, env_name)
            if add_bias:
                phi_s = torch.cat((feature_net.state_feature(
                    torch.from_numpy(ob_processed).float().to(device)).cpu().squeeze(),
                    torch.tensor([1.]))).numpy()
            else:
                phi_s = feature_net.state_feature(
                    torch.from_numpy(ob_processed).float().to(device)).cpu().squeeze().numpy()
            f_counts += phi_s
            steps += 1
            acc_reward += r[0]
            if done:
                print("steps: {}, return: {}".format(steps, acc_reward))
                break
        learning_returns.append(acc_reward)

    env.close()
    #tf.reset_default_graph()
    del agent
    del env

    ave_fcounts = f_counts / episode_count
    return learning_returns, ave_fcounts
def generate_novice_demos(env, env_name, agent, model_dir):
    checkpoint_min = 50
    checkpoint_max = 600
    checkpoint_step = 50
    checkpoints = []
    if env_name == "enduro":
        checkpoint_min = 3100
        checkpoint_max = 3650
    """
    elif env_name == "seaquest":
        checkpoint_min = 10
        checkpoint_max = 65
        checkpoint_step = 5
    """
    for i in range(checkpoint_min, checkpoint_max + checkpoint_step, checkpoint_step):
        if i < 10:
            checkpoints.append('0000' + str(i))
        elif i < 100:
            checkpoints.append('000' + str(i))
        elif i < 1000:
            checkpoints.append('00' + str(i))
        elif i < 10000:
            checkpoints.append('0' + str(i))
    print(checkpoints)

    demonstrations = []
    learning_returns = []
    learning_rewards = []
    for checkpoint in checkpoints:
        model_path = model_dir + "/models/" + env_name + "_25/" + checkpoint
        #if env_name == "seaquest":
        #    model_path = model_dir + "/models/" + env_name + "_5/" + checkpoint
        agent.load(model_path)
        episode_count = 5  #30
        for i in range(episode_count):
            done = False
            traj = []
            actions = []
            gt_rewards = []
            r = 0

            ob = env.reset()
            steps = 0
            acc_reward = 0
            #os.mkdir('images/' + str(checkpoint))
            frameno = 0
            while True:
                action = agent.act(ob, r, done)
                ob, r, done, info = env.step(action)
                ob_processed = preprocess(ob, env_name)
                ob_processed = ob_processed[0]  #get rid of first dimension ob.shape = (1,84,84,4)
                traj.append(ob_processed)
                actions.append(action[0])
                #save_image(torch.from_numpy(ob_processed).permute(2, 0, 1).reshape(4*84, 84),
                #           'images/' + str(checkpoint) + '/' + str(frameno) + '_action_' + str(action[0]) + '.png')
                frameno += 1
                gt_rewards.append(r[0])
                steps += 1
                acc_reward += r[0]
                if done:
                    print("checkpoint: {}, steps: {}, return: {}".format(checkpoint, steps, acc_reward))
                    break
            print("traj length", len(traj))
            print("demo length", len(demonstrations))
            demonstrations.append([traj, actions])
            learning_returns.append(acc_reward)
            learning_rewards.append(gt_rewards)

    return demonstrations, learning_returns, learning_rewards
def generate_mean_map_noop_demos(env, env_name, agent, mean_path, map_path):
    demonstrations = []
    learning_returns = []
    learning_rewards = []
    # for model_path in [map_path, mean_path,
    #                    '../../learning-rewards-of-learners/learner/models/seaquest_25/00025',
    #                    '../../learning-rewards-of-learners/learner/models/seaquest_25/00325',
    #                    '../../learning-rewards-of-learners/learner/models/seaquest_25/00800',
    #                    '../../learning-rewards-of-learners/learner/models/seaquest_25/01450']:
    #
    #     agent.load(model_path)
    #     episode_count = 1
    #     for i in range(episode_count):
    #         done = False
    #         traj = []
    #         gt_rewards = []
    #         r = 0
    #
    #         ob = env.reset()
    #         steps = 0
    #         acc_reward = 0
    #         while steps < 7000:
    #             action = agent.act(ob, r, done)
    #             ob, r, done, _ = env.step(action)
    #             if args.render:
    #                 env.render()
    #             ob_processed = preprocess(ob, env_name)
    #             traj.append(ob_processed)
    #
    #             gt_rewards.append(r[0])
    #             steps += 1
    #             acc_reward += r[0]
    #             if done:
    #                 break
    #         print("checkpoint: {}, steps: {}, return: {}".format(model_path, steps, acc_reward))
    #
    #         print("traj length", len(traj))
    #         print("demo length", len(demonstrations))
    #         demonstrations.append(traj)
    #         learning_returns.append(acc_reward)
    #         learning_rewards.append(gt_rewards)

    #add no-op demos
    done = False
    traj = []
    gt_rewards = []
    r = 0

    ob = env.reset()
    steps = 0
    acc_reward = 0
    while steps < 3000:
        action = 0  #no-op instead of agent.act(ob, r, done)
        ob, r, done, _ = env.step(action)
        ob_processed = preprocess(ob, env_name)
        traj.append(ob_processed)

        gt_rewards.append(r[0])
        steps += 1
        acc_reward += r[0]
        if done:
            print("checkpoint: {}, steps: {}, return: {}".format("noop", steps, acc_reward))
            break
    print("noop traj length", len(traj))
    print("demo length", len(demonstrations))
    demonstrations.append(traj)
    learning_returns.append(acc_reward)
    learning_rewards.append(gt_rewards)

    return demonstrations, learning_returns, learning_rewards