Example #1
def generate_demos(env,
                   env_name,
                   model,
                   agent,
                   device,
                   save_dir='evals',
                   episodes=100,
                   temperature=1):
    os.makedirs(save_dir, exist_ok=True)
    save_path = save_dir + '/' + model.name + '.log'
    '''
    if os.path.exists(save_path):
        print('evaluation not completed as %s already exists' % save_path)
        return
    '''

    print('')
    print('evaluating {}'.format(model.name))

    model_path = "models/" + env_name + "_25/01050"
    if env_name == "seaquest":
        model_path = "models/" + env_name + "_5/00035"

    agent.load(model_path)

    logs = [[], []]  # steps, return

    with torch.no_grad():
        for i in range(episodes):
            done = False
            r = 0
            ob = preprocess(env.reset(), env_name)
            steps = 0
            acc_reward = 0
            while True:
                a_act = agent.act(ob, r, done)
                ob = torch.from_numpy(ob).float().to(device)
                action = model.act(ob, temperature)

                #print(a_act, action)

                ob, r, done, _ = env.step(action)
                #env.render()
                ob = preprocess(ob, env_name)
                acc_reward += r[0]
                steps += 1
                if done:
                    print("steps: {}, return: {}".format(steps, acc_reward))
                    logs[0] += [steps]
                    logs[1] += [acc_reward]
                    break

    print('return stats:')
    print('min: {}'.format(np.min(logs[1])))
    print('mean: {}'.format(np.mean(logs[1])))
    print('max: {}'.format(np.max(logs[1])))

    with open(save_path, 'wb') as f:
        pickle.dump(logs, f)
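
The evaluation log written above is just a pickled pair of lists (steps per episode, return per episode). A minimal sketch for loading it back and summarizing it; the log path below is a stand-in for whatever save_dir and model.name were actually used:

import pickle
import numpy as np

# Hypothetical path; mirrors save_dir + '/' + model.name + '.log' from generate_demos above.
log_path = 'evals/model_name.log'
with open(log_path, 'rb') as f:
    steps, returns = pickle.load(f)  # logs[0] = steps, logs[1] = returns
print('episodes: {}, mean steps: {:.1f}, mean return: {:.1f}'.format(
    len(returns), np.mean(steps), np.mean(returns)))
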
def generate_fitness(env,
                     env_name,
                     policy,
                     reward_fn,
                     num_episodes,
                     seed,
                     render=False,
                     softmax=True):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    torch.manual_seed(seed)
    np.random.seed(seed)
    tf.set_random_seed(seed)
    env.unwrapped.envs[0].seed(seed)

    learning_returns = []
    true_returns = []
    for i in range(num_episodes):
        done = False
        traj = []
        gt_rewards = []
        r = 0

        ob = env.reset()
        ob_processed = preprocess(ob, env_name)
        #print(ob_processed.shape)
        ob_cuda = torch.from_numpy(np.array(ob_processed)).float().to(device)
        #print(ob_cuda.size())

        steps = 0
        acc_reward = 0
        true_reward = 0
        while True:

            action = policy.select_action(ob_cuda, softmax=softmax)
            #print(action)
            ob, r, done, _ = env.step(action)
            if render:
                env.render()
            ob_processed = preprocess(ob, env_name)
            #print(ob_processed.shape)
            ob_cuda = torch.from_numpy(
                np.array(ob_processed)).float().to(device)
            #print(ob_cuda.size())
            #ob_processed = ob_processed[0] #get rid of first dimension ob.shape = (1,84,84,4)

            steps += 1
            #print(reward_fn.predict_reward(ob_cuda).item())
            acc_reward += reward_fn.predict_reward(ob_cuda).item()
            true_reward += r
            if done or steps > 1000:  #TODO: remove this if I can since it will hurt performance
                if render:
                    print(
                        "rollout: {}, steps: {}, pred return: {}, actual return {}"
                        .format(i, steps, acc_reward, true_reward))
                break
        learning_returns.append(acc_reward)
        true_returns.append(true_reward)

    return np.mean(learning_returns), np.mean(true_returns)
Example #3
def generate_mean_map_noop_demos(env, env_name):

    #add no-op demos
    done = False
    traj = []
    gt_rewards = []
    r = 0

    ob = env.reset()
    steps = 0
    acc_reward = 0
    while steps < 7000:
        action = 0  #agent.act(ob, r, done)
        ob, r, done, _ = env.step(action)
        ob_processed = preprocess(ob, env_name)
        #ob_processed = ob_processed[0] #get rid of first dimension ob.shape = (1,84,84,4)
        traj.append(ob_processed)

        gt_rewards.append(r[0])
        steps += 1
        acc_reward += r[0]
        if done:
            print("checkpoint: {}, steps: {}, return: {}".format(
                "noop", steps, acc_reward))
            break
    print("noop traj length", len(traj))

    return traj, acc_reward, gt_rewards
Example #4
    def step_wait(self):
        obs, rews, news, infos = self.venv.step_wait()
        # obs shape: [num_env,84,84,4] in case of atari games
        #plt.subplot(1,2,1)
        #plt.imshow(obs[0][:,:,0])
        #crop off top of image
        #n = 10
        #no_score_obs = copy.deepcopy(obs)
        #obs[:,:n,:,:] = 0

        #Need to normalize for my reward function
        #normed_obs = obs / 255.0
        #mask and normalize for input to network
        normed_obs = preprocess(obs, self.env_name)
        #plt.subplot(1,2,2)
        #plt.imshow(normed_obs[0][:,:,0])
        #plt.show()
        #print(traj[0][0][40:60,:,:])

        with torch.no_grad():
            rews_network = self.reward_net.forward(
                torch.from_numpy(np.array(normed_obs)).float().to(
                    self.device)).cpu().numpy().squeeze()

        return obs, rews_network, news, infos
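
step_wait above overrides the rewards of a vectorized environment with the output of a learned reward network. A minimal sketch of how such a wrapper might be assembled, assuming OpenAI baselines' VecEnvWrapper plus the same preprocess helper, reward_net, env_name, and device attributes used in the snippet (not necessarily the author's exact class):

import numpy as np
import torch
from baselines.common.vec_env import VecEnvWrapper


class LearnedRewardVecEnv(VecEnvWrapper):
    """Hypothetical wrapper: replaces the true env reward with a learned reward network's prediction."""

    def __init__(self, venv, reward_net, env_name, device):
        super().__init__(venv)
        self.reward_net = reward_net
        self.env_name = env_name
        self.device = device

    def reset(self):
        return self.venv.reset()

    def step_wait(self):
        obs, rews, news, infos = self.venv.step_wait()
        normed_obs = preprocess(obs, self.env_name)  # mask/normalize, same helper as in the snippets
        with torch.no_grad():
            rews_network = self.reward_net(
                torch.from_numpy(np.array(normed_obs)).float().to(self.device)
            ).cpu().numpy().squeeze()
        return obs, rews_network, news, infos
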
Example #5
def get_demo_feature_counts(env_name, trajectory, feature_net, max_length):
    learning_returns = []
    fcount_rollouts = []  #keep track of the feature counts for each rollout
    num_steps = []

    f_counts = np.zeros(feature_net.fc2.in_features)

    steps = 0

    for i in range(min(max_length, len(trajectory))):
        ob = trajectory[i]
        steps += 1
        ob_processed = preprocess(ob, env_name)
        phi_s = feature_net.state_feature(
            torch.from_numpy(ob_processed).float().to(
                device)).cpu().squeeze().numpy()
        f_counts += phi_s

    ave_fcounts = f_counts  # single demo, so the "average" is just the summed feature counts
    fcount_rollouts.append(ave_fcounts)
    #print('ave', ave_fcounts)
    #print('computed ave', np.mean(np.array(fcount_rollouts), axis=0))
    return ave_fcounts, fcount_rollouts, [steps]
def generate_dropout_distribution_noop(env, env_name, agent, dropout_net, num_dropout_samples, device):

    dropout_returns = np.zeros(num_dropout_samples)
    true_returns = []
    # for checkpoint in checkpoints:

    episode_count = 1
    for i in range(episode_count):
        done = False
        traj = []
        r = 0

        ob = env.reset()
        steps = 0
        acc_reward = 0
        while steps < 20000:
            action = 0 #no-op action
            ob, r, done, _ = env.step(action)
            ob_processed = preprocess(ob, env_name)
            #ob_processed = ob_processed #get rid of first dimension ob.shape = (1,84,84,4)
            ob_processed = torch.from_numpy(ob_processed).float().to(device)
            for d in range(num_dropout_samples):
                dropout_returns[d] += dropout_net.cum_return(ob_processed)[0].item()

            steps += 1
            if steps % 1000 == 0:
                print(steps)
            acc_reward += r[0]
            if done:
                print("noop:, episode: {}, steps: {}, return: {}".format(i, steps,acc_reward))
                break
        true_returns.append(acc_reward)


    return dropout_returns, true_returns
Example #7
    def eval(self, env, agent):
        rewards = []
        episode_count = self.num_eval_episodes
        reward = 0
        done = False
        #writer = open(self.checkpoint_dir + "/" +self.env_name + "_bc_results.txt", 'w')
        for i in range(int(episode_count)):
            ob = env.reset()
            steps = 0
            acc_reward = 0
            while True:
                #preprocess the state
                state = preprocess(ob, self.env_name)
                state = np.transpose(state, (0, 3, 1, 2))
                if np.random.rand() < self.epsilon_greedy:
                    #print('eps greedy action')
                    action = env.action_space.sample()
                else:
                    #print('policy action')
                    action = agent.get_action(state)
                ob, reward, done, _ = env.step(action)
                steps += 1
                acc_reward += reward
                if done:
                    print("Episode: {}, Steps: {}, Reward: {}".format(
                        i, steps, acc_reward))
                    #writer.write("{}\n".format(acc_reward[0]))
                    rewards.append(acc_reward)
                    break

        print("Mean reward is: " + str(np.mean(rewards)))
Example #8
def generate_ensemble_distribution(env, env_name, agent, model_dir, checkpoint,
                                   ensemble, num_rollouts, device):
    # checkpoints = []
    # checkpts = [500]
    # for i in checkpts:
    #     if i < 10:
    #         checkpoints.append('0000' + str(i))
    #     elif i < 100:
    #         checkpoints.append('000' + str(i))
    #     elif i < 1000:
    #         checkpoints.append('00' + str(i))
    #     elif i < 10000:
    #         checkpoints.append('0' + str(i))
    # print(checkpoints)

    ensemble_returns = []
    true_returns = []
    # for checkpoint in checkpoints:

    model_path = model_dir + "/models/" + env_name + "_25/" + checkpoint
    #if env_name == "seaquest":
    #    model_path = model_dir + "/models/" + env_name + "_5/" + checkpoint

    agent.load(model_path)
    episode_count = num_rollouts
    for i in range(episode_count):
        done = False
        traj = []
        r = 0

        ob = env.reset()
        steps = 0
        acc_reward = 0
        while True:
            action = agent.act(ob, r, done)
            ob, r, done, _ = env.step(action)
            ob_processed = preprocess(ob, env_name)
            ob_processed = ob_processed[
                0]  #get rid of first dimension ob.shape = (1,84,84,4)
            traj.append(ob_processed)

            steps += 1
            acc_reward += r[0]
            if done:
                print("checkpoint: {}, episode: {}, steps: {}, return: {}".
                      format(checkpoint, i, steps, acc_reward))
                break
        #now run the traj through the network
        #convert to pytorch tensor
        traj_i = np.array(traj)
        traj_i = torch.from_numpy(traj_i).float().to(device)
        for ensemble_net in ensemble:
            cum_ret = ensemble_net.cum_return(traj_i)[0].item()
            ensemble_returns.append(cum_ret)
            #print("sample", i, "return = ", cum_ret)
        #print("traj length", len(traj))
        true_returns.append(acc_reward)

    return ensemble_returns, true_returns
Example #9
    def step_wait(self):
        obs, rews, news, infos = self.venv.step_wait()
        normed_obs = preprocess(obs, self.env_name)

        with torch.no_grad():
            rews_network = self.reward_net.forward(
                torch.from_numpy(np.array(normed_obs)).float().to(
                    self.device)).cpu().numpy().squeeze()

        return obs, rews_network, news, infos
def generate_dropout_distribution_checkpoint(env, env_name, agent, checkpoint_model_dir, dropout_net, num_rollouts, num_dropout_samples, device, time_limit=100000):

    dropout_returns = []
    true_returns = []
    # for checkpoint in checkpoints:

    model_path = checkpoint_model_dir
    #if env_name == "seaquest":
    #    model_path = model_dir + "/models/" + env_name + "_5/" + checkpoint

    agent.load(model_path)
    episode_count = num_rollouts
    for i in range(episode_count):
        dropout_rets = np.zeros(num_dropout_samples)
        done = False
        traj = []
        r = 0

        ob = env.reset()
        steps = 0
        acc_reward = 0
        while steps < time_limit:
            action = agent.act(ob, r, done)
            ob, r, done, _ = env.step(action)
            ob_processed = preprocess(ob, env_name)
            #ob_processed = ob_processed #get rid of first dimension ob.shape = (1,84,84,4)
            ob_processed = torch.from_numpy(ob_processed).float().to(device)
            for d in range(num_dropout_samples):
                dropout_rets[d] += dropout_net.cum_return(ob_processed)[0].item()

            del ob_processed
            steps += 1
            #print(steps)
            acc_reward += r[0]
            if done:
                print("checkpoint: {}, episode: {}, steps: {}, return: {}".format(model_path, i, steps,acc_reward))
                break
        if steps >= time_limit:
            print("checkpoint: {}, episode: {}, steps: {}, return: {}".format(model_path, i, steps,acc_reward))

        true_returns.append(acc_reward)
        dropout_returns.extend(dropout_rets)


    return dropout_returns, true_returns
    def generate_demos(self, env, agent, epsilon_greedy):
        print("Generating demos for epsilon=", epsilon_greedy)
        rewards = []
        episode_count = self.num_eval_episodes
        reward = 0
        done = False
        cum_steps = []
        demos = []
        #writer = open(self.checkpoint_dir + "/" +self.env_name + "_bc_results.txt", 'w')
        for i in range(int(episode_count)):
            ob = env.reset()
            steps = 0
            acc_reward = 0
            traj = []
            while True:
                #preprocess the state
                state = preprocess(ob, self.env_name)

                traj.append(state)
                state = np.transpose(state, (0, 3, 1, 2))
                if np.random.rand() < epsilon_greedy:
                    #print('eps greedy action')
                    action = env.action_space.sample()
                else:
                    #print('policy action')
                    action = agent.get_action(state)
                ob, reward, done, _ = env.step(action)
                steps += 1
                acc_reward += reward
                if done:
                    print("Episode: {}, Steps: {}, Reward: {}".format(
                        i, steps, acc_reward))
                    #writer.write("{}\n".format(acc_reward[0]))
                    rewards.append(acc_reward)
                    cum_steps.append(steps)
                    break
            print("traj length", len(traj))
            demos.append(traj)
            print("demo len", len(demos))

        print("Mean reward is: " + str(np.mean(rewards)))
        print("Mean step length is: " + str(np.mean(cum_steps)))
        return demos, rewards
Example #12
def generate_expert_demos(env, env_name, agent, epsilon_greedy):

    demonstrations = []
    learning_returns = []
    learning_rewards = []
    model_path = "path_to_model"

    agent.load(model_path)
    episode_count = 25
    for i in range(episode_count):
        done = False
        traj = []
        gt_rewards = []
        r = 0

        ob = env.reset()
        #traj.append(ob)
        #print(ob.shape)
        steps = 0
        acc_reward = 0
        while True:
            if np.random.rand() < epsilon_greedy:
                action = [env.action_space.sample()]
            else:
                action = agent.act(ob, r, done)
            ob_processed = preprocess(ob, env_name)
            traj.append((ob_processed, action))
            ob, r, done, _ = env.step(action)
            #print(ob.shape)

            gt_rewards.append(r[0])
            steps += 1
            acc_reward += r[0]
            if done or steps > 4000:
                print("steps: {}, return: {}".format(steps, acc_reward))
                break
        if acc_reward > 300:
            print("traj length", len(traj))
            demonstrations.append(traj)
            print("demo length", len(demonstrations))
            learning_returns.append(acc_reward)
            learning_rewards.append(gt_rewards)
    print(np.mean(learning_returns), np.max(learning_returns))
    return demonstrations, learning_returns, learning_rewards
Example #13
def generate_ensemble_distribution_checkpoint(env, env_name, agent,
                                              checkpoint_model_dir, ensemble,
                                              num_rollouts, device):

    ensemble_returns = []
    true_returns = []
    # for checkpoint in checkpoints:

    model_path = checkpoint_model_dir
    #if env_name == "seaquest":
    #    model_path = model_dir + "/models/" + env_name + "_5/" + checkpoint

    agent.load(model_path)
    episode_count = num_rollouts
    for i in range(episode_count):
        ensemble_rets = np.zeros(len(ensemble))
        done = False
        traj = []
        r = 0

        ob = env.reset()
        steps = 0
        acc_reward = 0
        while True:
            action = agent.act(ob, r, done)
            ob, r, done, _ = env.step(action)
            ob_processed = preprocess(ob, env_name)
            #ob_processed = ob_processed #get rid of first dimension ob.shape = (1,84,84,4)
            ob_processed = torch.from_numpy(ob_processed).float().to(device)
            for idx, net in enumerate(ensemble):
                ensemble_rets[idx] += net.cum_return(ob_processed)[0].item()

            del ob_processed
            steps += 1
            # print(steps)
            acc_reward += r[0]
            if done:
                print("checkpoint: {}, episode: {}, steps: {}, return: {}".
                      format(model_path, i, steps, acc_reward))
                break
        true_returns.append(acc_reward)
        ensemble_returns.extend(ensemble_rets)

    return ensemble_returns, true_returns
Example #14
def generate_demos(env, env_name, agent, checkpoint_path, num_demos):
    print("generating demos from checkpoint:", checkpoint_path)

    demonstrations = []
    learning_returns = []

    model_path = checkpoint_path

    agent.load(model_path)
    episode_count = num_demos
    for i in range(episode_count):
        done = False
        traj = []
        gt_rewards = []
        r = 0

        ob = env.reset()
        #traj.append(ob)
        #print(ob.shape)
        steps = 0
        acc_reward = 0
        while True:
            action = agent.act(ob, r, done)
            ob_processed = preprocess(ob, env_name)
            #ob_processed = ob_processed[0] #get rid of spurious first dimension ob.shape = (1,84,84,4)
            traj.append((ob_processed, action))
            ob, r, done, _ = env.step(action)
            #print(ob.shape)

            gt_rewards.append(r[0])
            steps += 1
            acc_reward += r[0]
            if done:
                print("demo: {}, steps: {}, return: {}".format(
                    i, steps, acc_reward))
                break
        print("traj length", len(traj))
        print("demo length", len(demonstrations))
        demonstrations.append(traj)
        learning_returns.append(acc_reward)
    print("Mean", np.mean(learning_returns), "Max", np.max(learning_returns))

    return demonstrations, learning_returns
def generate_dropout_distribution_framestack(env, env_name, framestack_path, dropout_net, num_dropout_samples, device, time_limit=100000):
    #uses a prerecorded framestack to do return uncertainty analysis on
    dropout_rets = np.zeros(num_dropout_samples)
    true_returns = [-1] #TODO: I don't have a way to get true returns yet. Need to grab these from Prabhat's code. Should be able to get from rewards saved

    #load the framestack
    trajectory = np.load(framestack_path)

    for i in range(min(time_limit, len(trajectory))):
        ob = trajectory[i]
        ob_processed = preprocess(ob, env_name)
        ob_processed = torch.from_numpy(ob_processed).float().to(device)
        for d in range(num_dropout_samples):
            dropout_rets[d] += dropout_net.cum_return(ob_processed)[0].item()

        del ob_processed

    #true_returns.append(acc_reward) #TODO



    return dropout_rets, true_returns
    def generate_noop_demo(self, env):
        print("Generating demos for noop agent")

        noop_action = 0

        rewards = []
        episode_count = 4
        reward = 0
        done = False
        cum_steps = []
        demos = []
        #writer = open(self.checkpoint_dir + "/" +self.env_name + "_bc_results.txt", 'w')
        for i in range(int(episode_count)):
            ob = env.reset()
            steps = 0
            acc_reward = 0
            traj = []
            while True:
                #preprocess the state
                state = preprocess(ob, self.env_name)
                traj.append(state)
                state = np.transpose(state, (0, 3, 1, 2))
                ob, reward, done, _ = env.step(noop_action)
                steps += 1
                acc_reward += reward
                if done or steps > 500:
                    print("Episode: {}, Steps: {}, Reward: {}".format(
                        i, steps, acc_reward))
                    #writer.write("{}\n".format(acc_reward[0]))
                    rewards.append(acc_reward)
                    cum_steps.append(steps)
                    break
            demos.append(traj)

        print("Mean reward is: " + str(np.mean(rewards)))
        print("Mean step length is: " + str(np.mean(cum_steps)))
        return demos
Example #17
def get_preprocessed_trajectories(env_name, dataset, data_dir,
                                  preprocess_name):
    """returns an array of trajectories corresponding to what you would get running checkpoints from PPO
       demonstrations are grayscaled, maxpooled, stacks of 4 with normalized values between 0 and 1 and
       top section of screen is masked
    """

    print("generating human demos for", env_name)
    demos = get_sorted_traj_indices(env_name, dataset)
    human_scores = []
    human_demos = []
    for indx, score in demos:
        human_scores.append(score)
        traj_dir = path.join(data_dir, 'screens', env_name, str(indx))
        #print("generating traj from", traj_dir)
        maxed_traj = MaxSkipAndWarpFrames(traj_dir)
        stacked_traj = StackFrames(maxed_traj)
        demo_norm_mask = []
        #normalize values to be between 0 and 1 and have top part masked
        for ob in stacked_traj:
            demo_norm_mask.append(preprocess(ob, preprocess_name)[0])
        human_demos.append(demo_norm_mask)
    return human_demos, human_scores
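
The preprocess helper called throughout these snippets is not shown. Based on the docstring above and the comments in the earlier step_wait example ("mask and normalize for input to network", obs[:,:n,:,:] = 0, obs / 255.0), a plausible minimal sketch is the following; the number of masked rows is an assumption and likely varies per game in practice:

import numpy as np


def preprocess(ob, env_name, mask_rows=10):
    """Hypothetical sketch: normalize a stacked-frame observation to [0, 1] and mask the
    score region at the top of the screen. ob is expected to have shape (N, 84, 84, 4)."""
    ob = np.asarray(ob, dtype=np.float32) / 255.0  # normalize pixel values
    ob[:, :mask_rows, :, :] = 0.0                  # zero out the top rows (score display)
    return ob
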
    reward_net.to(device)

    for i in range(int(episode_count)):
        ob = env.reset()
        steps = 0
        acc_reward = 0.
        pred_acc_reward = 0.
        while True:
            if np.random.rand() < 0.01:
                action = env.action_space.sample()
            else:
                action = agent.act(ob, reward, done)
            #action = env.action_space.sample()
            ob, reward, done, _ = env.step(action)
            #predict reward of ob
            input_ob = preprocess(ob, env_name)
            input_ob = torch.from_numpy(input_ob).float().to(device)
            with torch.no_grad():
                rpred = reward_net(input_ob).item()

            if args.render:
                print("Pred {} vs. True {}".format(rpred, reward))
                if abs(rpred) > 0.5:
                    input()

                env.render()
            pred_acc_reward += rpred
            steps += 1
            acc_reward += reward
            if done:
                print(steps, acc_reward, pred_acc_reward)
Example #19
def generate_novice_demos(env, env_name, agent, model_dir, debug):
    if debug:
        checkpoint_min = 300
        checkpoint_max = 400
        checkpoint_step = 50
    else:
        checkpoint_min = 50
        checkpoint_max = 600
        checkpoint_step = 50
    checkpoints = []
    if env_name == "enduro":
        checkpoint_min = 3100
        checkpoint_max = 3650
    elif env_name == "seaquest":
        checkpoint_min = 10
        checkpoint_max = 65
        checkpoint_step = 5
    for i in range(checkpoint_min, checkpoint_max + checkpoint_step, checkpoint_step):
        if i < 10:
            checkpoints.append('0000' + str(i))
        elif i < 100:
            checkpoints.append('000' + str(i))
        elif i < 1000:
            checkpoints.append('00' + str(i))
        elif i < 10000:
            checkpoints.append('0' + str(i))
    print(checkpoints)



    demonstrations = []
    learning_returns = []
    learning_rewards = []
    for checkpoint in checkpoints:

        model_path = model_dir + env_name + "_25/" + checkpoint
        if env_name == "seaquest":
            model_path = model_dir + env_name + "_5/" + checkpoint

        agent.load(model_path)
        episode_count = 1
        for i in range(episode_count):
            done = False
            traj = []
            gt_rewards = []
            r = 0

            ob = env.reset()
            steps = 0
            acc_reward = 0
            while True:
                action = agent.act(ob, r, done)
                ob, r, done, _ = env.step(action)
                ob_processed = preprocess(ob, env_name)
                #ob_processed = ob_processed[0] #get rid of first dimension ob.shape = (1,84,84,4)
                traj.append(ob_processed)
                #env.render()
                gt_rewards.append(r[0])
                steps += 1
                acc_reward += np.sign(r[0])
                if done:
                    print("checkpoint: {}, steps: {}, clipped return: {}, true reward {}".format(checkpoint, steps,acc_reward, np.sum(gt_rewards)))
                    break
            print("traj length", len(traj))
            print("demo length", len(demonstrations))
            demonstrations.append(traj)
            learning_returns.append(acc_reward)
            learning_rewards.append(gt_rewards)

    return demonstrations, learning_returns, learning_rewards
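
The zero-padding loop above (repeated in several later snippets) builds five-character checkpoint names. An equivalent, more compact construction, assuming checkpoint indices stay below 10000 (the original chain silently skips anything larger):

checkpoints = [
    str(i).zfill(5)
    for i in range(checkpoint_min, checkpoint_max + checkpoint_step, checkpoint_step)
]
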
def get_policy_feature_counts(env_name, checkpointpath, num_rollouts,
                              fixed_horizon):
    if env_name == "spaceinvaders":
        env_id = "SpaceInvaders"
    elif env_name == "mspacman":
        env_id = "MsPacman"
    elif env_name == "videopinball":
        env_id = "VideoPinball"
    elif env_name == "beamrider":
        env_id = "BeamRider"
    elif env_name == "montezumarevenge":
        env_id = "MontezumaRevenge"
    else:
        env_id = env_name[0].upper() + env_name[1:]

    if fixed_horizon:
        env_id += "NoFrameskipFixedHorizon-v0"
    else:
        env_id += "NoFrameskip-v4"

    env_type = "atari"

    stochastic = True

    #env id, env type, num envs, and seed
    env = make_vec_env(env_id,
                       'atari',
                       1,
                       0,
                       wrapper_kwargs={
                           'clip_rewards': False,
                           'episode_life': False,
                       })

    env = VecFrameStack(env, 4)

    agent = PPO2Agent(
        env, env_type,
        stochastic)  #defaults to stochastic = False (deterministic policy)
    #agent = RandomAgent(env.action_space)

    learning_returns = []

    print(checkpointpath)

    agent.load(checkpointpath)
    episode_count = num_rollouts

    f_counts = np.zeros(3)  #hard coded neg, zero, pos features

    for i in range(episode_count):
        done = False
        traj = []
        r = 0

        ob = env.reset()
        #traj.append(ob)
        #print(ob.shape)
        steps = 0
        acc_reward = 0
        while True:
            action = 0  # NoOp agent.act(ob, r, done)
            #print(action)
            ob, r, done, _ = env.step(action)
            ob_processed = preprocess(ob, env_name)
            #print(ob_processed.shape)
            if np.sign(r[0]) == -1:
                phi_s = np.array([1.0, 0.0, 0.0])
            elif np.sign(r[0]) == 0:
                phi_s = np.array([0.0, 1.0, 0.0])
            else:
                phi_s = np.array([0.0, 0.0, 1.0])
            #print(phi_s.shape)
            f_counts += phi_s
            steps += 1
            #print(steps)
            acc_reward += r[0]
            if done:
                print("steps: {}, return: {}".format(steps, acc_reward))
                break

        learning_returns.append(acc_reward)

    env.close()
    #tf.reset_default_graph()

    ave_fcounts = f_counts / episode_count

    return learning_returns, ave_fcounts
    demonstrator.load(model_path)

    for i in range(episode_count):
        done = False
        traj = []
        r = 0

        ob = env.reset()
        #traj.append(ob)
        #print(ob.shape)
        steps = 0
        acc_reward = 0
        while True:
            action = demonstrator.act(ob, r, done)
            ob, r, done, _ = env.step(action)
            ob_processed = preprocess(ob, env_name)
            ob_processed = ob_processed[0] #get rid of spurious first dimension ob.shape = (1,84,84,4)
            traj.append(ob_processed)
            steps += 1
            acc_reward += r[0]
            if done:
                print("checkpoint: {}, steps: {}, return: {}".format(checkpoint, steps,acc_reward))
                break
        print("traj length", len(traj))
        #print("demo length", len(demonstrations))
        #demonstrations.append(traj)
        learning_returns_extrapolate.append(acc_reward)
        pred_returns_extrapolate.append(reward_net.cum_return(torch.from_numpy(np.array(traj)).float().to(device))[0].item())
        print("pred return", pred_returns_extrapolate[-1])

Example #22
def generate_demos(env, env_name, agent, model_dir, checkpoint_range, save_dir='demos', episodes_per_checkpoint=5, map_increment=1e9):
    save_path = save_dir + '/' + env_name + '.lmdb'
    if os.path.exists(save_path):
        print('Demonstrations not collected as %s already exists' % save_path)
        return

    checkpoints = []
    for i in checkpoint_range:
        if i < 10:
            checkpoints.append('0000' + str(i))
        elif i < 100:
            checkpoints.append('000' + str(i))
        elif i < 1000:
            checkpoints.append('00' + str(i))
        elif i < 10000:
            checkpoints.append('0' + str(i))
    print(checkpoints)

    makedirs(save_dir, exist_ok=True)
    map_counter = 1
    keys = []
    with lmdb.open(save_path, map_size=int(map_counter * map_increment)) as lmdb_env:
        for checkpoint in checkpoints:
            model_path = model_dir + "/models/" + env_name + "_25/" + checkpoint
            if env_name == "seaquest":
                model_path = model_dir + "/models/" + env_name + "_5/" + checkpoint

            agent.load(model_path)
            for i in range(episodes_per_checkpoint):
                done = False
                traj = []
                gt_rewards = []
                actions = []
                r = 0

                ob = env.reset()
                steps = 0
                acc_reward = 0
                while True:
                    action = agent.act(ob, r, done)
                    ob, r, done, _ = env.step(action)
                    ob_processed = preprocess(ob, env_name)
                    traj.append(ob_processed)
                    actions.append(action)

                    gt_rewards.append(r[0])
                    acc_reward += r[0]
                    steps += 1
                    if done:
                        print("checkpoint: {}, steps: {}, return: {}".format(checkpoint, steps, acc_reward))
                        break


                traj = (np.concatenate(traj, axis=0)*255).astype(np.uint8)
                actions = np.array(actions)
                gt_rewards = np.array(gt_rewards)
                value = {'states':traj,
                         'actions':actions,
                         'rewards':gt_rewards,
                         'length':steps,
                         'return':acc_reward}
                key = '%s_%s_%d' % (env_name, checkpoint, i)
                lmdb_env, key = lmdb_submit(key, value, lmdb_env, save_path, map_counter, map_increment)
                keys += [key]
        with lmdb_env.begin(write=True) as txn:
            txn.put(b'__keys__', pickle.dumps(keys))
    print('%d total demonstrations gathered' % len(keys))
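
Reading the demonstrations back out depends on how lmdb_submit serializes each value; assuming it pickles the value dict under its (possibly rewritten) key, consistent with the pickled __keys__ index above, a minimal read-back sketch might look like:

import lmdb
import pickle

# Hypothetical read-back; save_path is the same '<save_dir>/<env_name>.lmdb' written above.
with lmdb.open(save_path, readonly=True, lock=False) as lmdb_env:
    with lmdb_env.begin() as txn:
        keys = pickle.loads(txn.get(b'__keys__'))
        first_key = keys[0].encode() if isinstance(keys[0], str) else keys[0]
        demo = pickle.loads(txn.get(first_key))  # assumes lmdb_submit pickled the dict
        print(demo['length'], demo['return'], demo['states'].shape)
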
Example #23
def generate_mean_map_noop_demos(env, env_name, agent, mean_path, map_path):
    demonstrations = []
    learning_returns = []
    learning_rewards = []
    for model_path in [map_path]:

        agent.load(model_path)
        episode_count = 1
        for i in range(episode_count):
            done = False
            traj = []
            gt_rewards = []
            r = 0

            ob = env.reset()
            steps = 0
            acc_reward = 0
            while steps < 7000:
                action = agent.act(ob, r, done)
                ob, r, done, _ = env.step(action)
                if args.render:
                    env.render()
                ob_processed = preprocess(ob, env_name)
                #ob_processed = ob_processed[0] #get rid of first dimension ob.shape = (1,84,84,4)
                traj.append(ob_processed)

                gt_rewards.append(r[0])
                steps += 1
                acc_reward += r[0]
                if done:
                    break
            print("checkpoint: {}, steps: {}, return: {}".format(
                model_path, steps, acc_reward))

            print("traj length", len(traj))
            print("demo length", len(demonstrations))
            demonstrations.append(traj)
            learning_returns.append(acc_reward)
            learning_rewards.append(gt_rewards)

    #add no-op demos
    done = False
    traj = []
    gt_rewards = []
    r = 0

    ob = env.reset()
    steps = 0
    acc_reward = 0
    while steps < 7000:
        action = 0  #agent.act(ob, r, done)
        ob, r, done, _ = env.step(action)
        ob_processed = preprocess(ob, env_name)
        #ob_processed = ob_processed[0] #get rid of first dimension ob.shape = (1,84,84,4)
        traj.append(ob_processed)

        gt_rewards.append(r[0])
        steps += 1
        acc_reward += r[0]
        if done:
            print("checkpoint: {}, steps: {}, return: {}".format(
                "noop", steps, acc_reward))
            break
    print("noop traj length", len(traj))
    print("demo length", len(demonstrations))
    demonstrations.append(traj)
    learning_returns.append(acc_reward)
    learning_rewards.append(gt_rewards)

    return demonstrations, learning_returns, learning_rewards
Example #24
def get_policy_feature_counts(env_name, checkpointpath, num_rollouts, max_length = 3000):
    if env_name == "spaceinvaders":
        env_id = "SpaceInvadersNoFrameskip-v4"
    elif env_name == "mspacman":
        env_id = "MsPacmanNoFrameskip-v4"
    elif env_name == "videopinball":
        env_id = "VideoPinballNoFrameskip-v4"
    elif env_name == "beamrider":
        env_id = "BeamRiderNoFrameskip-v4"
    elif env_name == "montezumarevenge":
        env_id = "MontezumaRevengeNoFrameskip-v4"
    else:
        env_id = env_name[0].upper() + env_name[1:] + "NoFrameskip-v4"

    env_type = "atari"

    stochastic = True

    #env id, env type, num envs, and seed
    env = make_vec_env(env_id, 'atari', 1, 0,
                       wrapper_kwargs={
                           'clip_rewards':False,
                           'episode_life':False,
                       })



    env = VecFrameStack(env, 4)


    agent = PPO2Agent(env, env_type, stochastic)  #defaults to stochastic = False (deterministic policy)
    #agent = RandomAgent(env.action_space)

    learning_returns = []

    print(checkpointpath)

    agent.load(checkpointpath)
    episode_count = num_rollouts

    if args.no_term:
        f_counts = np.zeros(3)  #neg, zero, pos clipped rewards
    else:
        f_counts = np.zeros(4)

    for i in range(episode_count):
        print("epsiode", i)
        done = False
        traj = []
        r = 0

        ob = env.reset()
        #traj.append(ob)
        #print(ob.shape)
        steps = 0
        acc_reward = 0
        while steps < max_length:
            if not done:
                action = agent.act(ob, r, done)
                #print(action)
                ob, r, done, _ = env.step(action)
                ob_processed = preprocess(ob, env_name)
                #print(ob_processed.shape)
                if np.sign(r[0]) == -1:
                    if args.no_term:
                        phi_s = np.array([1.0, 0.0, 0.0])
                    else:
                        phi_s = np.array([1.0, 0.0, 0.0, 0.0])
                elif np.sign(r[0]) == 0:
                    if args.no_term:
                        phi_s = np.array([0.0, 1.0, 0.0])
                    else:
                        phi_s = np.array([0.0, 1.0, 0.0, 0.0])
                elif np.sign(r[0]) == 1:
                    if args.no_term:
                        phi_s = np.array([0.0, 0.0, 1.0])
                    else:
                        phi_s = np.array([0.0, 0.0, 1.0, 0.0])
                else:
                    print("error not a valid clipped reward")
                    sys.exit()
                #print(phi_s.shape)
                f_counts += phi_s
                steps += 1
                #print(steps)
                acc_reward += r[0]
                #if done:
                #    print("steps: {}, return: {}".format(steps,acc_reward))
            else:
                #add in appropriate padding and then break
                #print("adding padding", max_length - steps)
                if args.no_term:
                    phi_s = (max_length - steps) * np.array([0.0, 1.0, 0.0])
                else:
                    phi_s = (max_length - steps) * np.array([0.0, 0.0, 0.0, 1.0])
                f_counts += phi_s
                #print("f_counts", f_counts)

                break
        print("steps: {}, return: {}".format(steps,acc_reward))

        learning_returns.append(acc_reward)



    env.close()
    #tf.reset_default_graph()
    del agent
    del env

    ave_fcounts = f_counts/episode_count

    return learning_returns, ave_fcounts
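
The if/elif chains above map the sign of the clipped reward to a one-hot feature vector. A compact equivalent with the same ordering (negative, zero, positive, plus an optional terminal/padding slot when args.no_term is false):

import numpy as np


def clipped_reward_feature(r, no_term=True):
    """One-hot feature for sign(r): index 0 = negative, 1 = zero, 2 = positive.
    With no_term=False a fourth slot is left free for the terminal/padding feature."""
    dim = 3 if no_term else 4
    return np.eye(dim)[int(np.sign(r)) + 1]
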
Example #25
def generate_novice_demos(env, env_name, agent, model_dir):
    checkpoint_min = 550
    checkpoint_max = 600
    checkpoint_step = 50
    checkpoints = []
    if env_name == "enduro":
        checkpoint_min = 3100
        checkpoint_max = 3650
    elif env_name == "seaquest":
        checkpoint_min = 10
        checkpoint_max = 65
        checkpoint_step = 5
    for i in range(checkpoint_min, checkpoint_max + checkpoint_step,
                   checkpoint_step):
        if i < 10:
            checkpoints.append('0000' + str(i))
        elif i < 100:
            checkpoints.append('000' + str(i))
        elif i < 1000:
            checkpoints.append('00' + str(i))
        elif i < 10000:
            checkpoints.append('0' + str(i))
    #if env_name == "pong":
    #    checkpoints = ['00025','00050','00175','00200','00250','00350','00450','00500','00550','00600','00700','00700']
    print(checkpoints)

    demonstrations = []
    learning_returns = []
    learning_rewards = []
    for checkpoint in checkpoints:

        model_path = model_dir + "/models/" + env_name + "_25/" + checkpoint
        if env_name == "seaquest":
            model_path = model_dir + "/models/" + env_name + "_5/" + checkpoint

        agent.load(model_path)
        episode_count = 1
        for i in range(episode_count):
            done = False
            traj = []
            gt_rewards = []
            r = 0

            ob = env.reset()
            #traj.append(ob)
            #print(ob.shape)
            steps = 0
            acc_reward = 0
            while True:
                action = agent.act(ob, r, done)
                ob_processed = preprocess(ob, env_name)
                #ob_processed = ob_processed[0] #get rid of spurious first dimension ob.shape = (1,84,84,4)
                traj.append((ob_processed, action))
                ob, r, done, _ = env.step(action)
                #print(ob.shape)

                gt_rewards.append(r[0])
                steps += 1
                acc_reward += r[0]
                if done:
                    print("checkpoint: {}, steps: {}, return: {}".format(
                        checkpoint, steps, acc_reward))
                    break
            print("traj length", len(traj))
            print("demo length", len(demonstrations))
            demonstrations.append(traj)
            learning_returns.append(acc_reward)
            learning_rewards.append(gt_rewards)
    print(np.mean(learning_returns), np.max(learning_returns))
    return demonstrations, learning_returns, learning_rewards
def get_policy_feature_counts(env_name,
                              checkpointpath,
                              feature_net,
                              num_rollouts,
                              max_length,
                              no_op=False):
    if env_name == "spaceinvaders":
        env_id = "SpaceInvadersNoFrameskip-v4"
    elif env_name == "mspacman":
        env_id = "MsPacmanNoFrameskip-v4"
    elif env_name == "videopinball":
        env_id = "VideoPinballNoFrameskip-v4"
    elif env_name == "beamrider":
        env_id = "BeamRiderNoFrameskip-v4"
    elif env_name == "montezumarevenge":
        env_id = "MontezumaRevengeNoFrameskip-v4"
    else:
        env_id = env_name[0].upper() + env_name[1:] + "NoFrameskip-v4"

    env_type = "atari"

    stochastic = True

    #env id, env type, num envs, and seed
    env = make_vec_env(env_id,
                       'atari',
                       1,
                       0,
                       wrapper_kwargs={
                           'clip_rewards': False,
                           'episode_life': False,
                       })

    env = VecFrameStack(env, 4)

    agent = PPO2Agent(
        env, env_type,
        stochastic)  #defaults to stochastic = False (deterministic policy)

    #agent = RandomAgent(env.action_space)

    learning_returns = []
    fcount_rollouts = []  #keep track of the feature counts for each rollout
    num_steps = []

    print("using checkpoint", checkpointpath,
          "if none then using no-op policy")
    if not no_op:
        agent.load(checkpointpath)
    episode_count = num_rollouts

    f_counts = np.zeros(feature_net.fc2.in_features)

    for i in range(episode_count):
        done = False
        traj = []
        fc_rollout = np.zeros(feature_net.fc2.in_features)
        r = 0

        ob = env.reset()
        #traj.append(ob)
        #print(ob.shape)
        steps = 0
        acc_reward = 0
        while steps < max_length:
            if no_op:
                action = 0
            else:
                action = agent.act(ob, r, done)
            #print(action)
            ob, r, done, _ = env.step(action)
            env.render()
            ob_processed = preprocess(ob, env_name)
            #print(ob_processed.shape)
            phi_s = feature_net.state_feature(
                torch.from_numpy(ob_processed).float().to(
                    device)).cpu().squeeze().numpy()
            #print(phi_s.shape)
            fc_rollout += phi_s
            f_counts += phi_s
            steps += 1
            #print(steps)
            acc_reward += r[0]
            if done:
                print("didn't run long enough!")
                break
        print("steps: {}, return: {}".format(steps, acc_reward))

        fcount_rollouts.append(fc_rollout)
        learning_returns.append(acc_reward)
        num_steps.append(steps)

    env.close()
    #tf.reset_default_graph()

    ave_fcounts = f_counts / episode_count
    #print('ave', ave_fcounts)
    #print('computed ave', np.mean(np.array(fcount_rollouts), axis=0))
    return learning_returns, ave_fcounts, fcount_rollouts, num_steps
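
Since f_counts is sized by feature_net.fc2.in_features, fc2 is presumably the final linear reward layer; under that assumption the average feature counts yield a predicted return for the evaluated policy directly:

# Hedged sketch: predicted return under the learned linear reward, assuming feature_net.fc2
# maps the penultimate features to a scalar reward.
w = feature_net.fc2.weight.data.cpu().squeeze().numpy()
predicted_return = float(np.dot(w, ave_fcounts))
print("predicted return from feature counts:", predicted_return)
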
    episode_count = 1
    for i in range(episode_count):
        done = False
        traj = []
        r = 0

        ob = env.reset()
        #traj.append(ob)
        #print(ob.shape)
        steps = 0
        acc_reward = 0
        while True:
            action = agent.act(ob, r, done)
            ob, r, done, _ = env.step(action)
            #print(ob.shape)
            traj.append(preprocess(ob, env_name))
            steps += 1
            acc_reward += r[0]
            if done:
                print("checkpoint: {}, steps: {}, return: {}".format(
                    checkpoint, steps, acc_reward))
                break
        print("traj length", len(traj))
        print("demo length", len(demonstrations))

        demonstrations.append(traj)
        learning_returns_demos.append(acc_reward)
        pred_returns_demos.append(
            reward.cum_return(
                torch.from_numpy(np.array(traj)).float().to(device))[0].item())
        print("pred return", pred_returns_demos[-1])
def get_policy_feature_counts(env_name, checkpointpath, feature_net, num_rollouts, add_bias=False):
    if env_name == "spaceinvaders":
        env_id = "SpaceInvadersNoFrameskip-v4"
    elif env_name == "mspacman":
        env_id = "MsPacmanNoFrameskip-v4"
    elif env_name == "videopinball":
        env_id = "VideoPinballNoFrameskip-v4"
    elif env_name == "beamrider":
        env_id = "BeamRiderNoFrameskip-v4"
    elif env_name == "montezumarevenge":
        env_id = "MontezumaRevengeNoFrameskip-v4"
    else:
        env_id = env_name[0].upper() + env_name[1:] + "NoFrameskip-v4"

    env_type = "atari"

    stochastic = True

    #env id, env type, num envs, and seed
    env = make_vec_env(env_id, 'atari', 1, 0,
                       wrapper_kwargs={
                           'clip_rewards':False,
                           'episode_life':False,
                       })



    env = VecFrameStack(env, 4)


    agent = PPO2Agent(env, env_type, stochastic)  #defaults to stochastic = False (deterministic policy)
    #agent = RandomAgent(env.action_space)

    learning_returns = []

    print(checkpointpath)

    agent.load(checkpointpath)
    episode_count = num_rollouts

    if add_bias:
        f_counts = np.zeros(feature_net.fc2.in_features + 1)
    else:
        f_counts = np.zeros(feature_net.fc2.in_features)

    for i in range(episode_count):
        done = False
        traj = []
        r = 0

        ob = env.reset()
        #traj.append(ob)
        #print(ob.shape)
        steps = 0
        acc_reward = 0
        while steps < 7000:
            action = agent.act(ob, r, done)
            #print(action)
            ob, r, done, _ = env.step(action)
            ob_processed = preprocess(ob, env_name)
            #print(ob_processed.shape)
            if add_bias:
                phi_s = torch.cat((feature_net.state_feature(torch.from_numpy(ob_processed).float().to(device)).cpu().squeeze(), torch.tensor([1.]))).numpy()
            else:
                phi_s = feature_net.state_feature(torch.from_numpy(ob_processed).float().to(device)).cpu().squeeze().numpy()
            #print(phi_s.shape)
            f_counts += phi_s
            steps += 1
            #print(steps)
            acc_reward += r[0]
            if done:
                print("steps: {}, return: {}".format(steps,acc_reward))
                break

        learning_returns.append(acc_reward)



    env.close()
    #tf.reset_default_graph()
    del agent
    del env

    ave_fcounts = f_counts/episode_count

    return learning_returns, ave_fcounts
Example #29
def generate_novice_demos(env, env_name, agent, model_dir):
    checkpoint_min = 50
    checkpoint_max = 600
    checkpoint_step = 50
    checkpoints = []
    if env_name == "enduro":
        checkpoint_min = 3100
        checkpoint_max = 3650
    """
    elif env_name == "seaquest":
        checkpoint_min = 10
        checkpoint_max = 65
        checkpoint_step = 5
    """
    for i in range(checkpoint_min, checkpoint_max + checkpoint_step,
                   checkpoint_step):
        if i < 10:
            checkpoints.append('0000' + str(i))
        elif i < 100:
            checkpoints.append('000' + str(i))
        elif i < 1000:
            checkpoints.append('00' + str(i))
        elif i < 10000:
            checkpoints.append('0' + str(i))
    print(checkpoints)

    demonstrations = []
    learning_returns = []
    learning_rewards = []
    for checkpoint in checkpoints:

        model_path = model_dir + "/models/" + env_name + "_25/" + checkpoint
        #if env_name == "seaquest":
        #    model_path = model_dir + "/models/" + env_name + "_5/" + checkpoint

        agent.load(model_path)
        episode_count = 5  #30
        for i in range(episode_count):
            done = False
            traj = []
            actions = []
            gt_rewards = []
            r = 0

            ob = env.reset()
            steps = 0
            acc_reward = 0
            #os.mkdir('images/' + str(checkpoint))
            frameno = 0
            while True:
                action = agent.act(ob, r, done)
                ob, r, done, info = env.step(action)
                ob_processed = preprocess(ob, env_name)
                ob_processed = ob_processed[
                    0]  #get rid of first dimension ob.shape = (1,84,84,4)
                traj.append(ob_processed)
                actions.append(action[0])
                #save_image(torch.from_numpy(ob_processed).permute(2, 0, 1).reshape(4*84, 84), 'images/' + str(checkpoint) + '/' + str(frameno) + '_action_' + str(action[0]) + '.png')
                frameno += 1

                gt_rewards.append(r[0])
                steps += 1
                acc_reward += r[0]
                if done:
                    print("checkpoint: {}, steps: {}, return: {}".format(
                        checkpoint, steps, acc_reward))
                    break
            print("traj length", len(traj))
            print("demo length", len(demonstrations))
            demonstrations.append([traj, actions])
            learning_returns.append(acc_reward)
            learning_rewards.append(gt_rewards)

    return demonstrations, learning_returns, learning_rewards
Example #30
def generate_mean_map_noop_demos(env, env_name, agent, mean_path, map_path):
    demonstrations = []
    learning_returns = []
    learning_rewards = []
    # for model_path in [map_path, mean_path,
    #     '../../learning-rewards-of-learners/learner/models/seaquest_25/00025',
    #     '../../learning-rewards-of-learners/learner/models/seaquest_25/00325',
    #     '../../learning-rewards-of-learners/learner/models/seaquest_25/00800',
    #     '../../learning-rewards-of-learners/learner/models/seaquest_25/01450']:
    #
    #     agent.load(model_path)
    #     episode_count = 1
    #     for i in range(episode_count):
    #         done = False
    #         traj = []
    #         gt_rewards = []
    #         r = 0
    #
    #         ob = env.reset()
    #         steps = 0
    #         acc_reward = 0
    #         while steps < 7000:
    #             action = agent.act(ob, r, done)
    #             ob, r, done, _ = env.step(action)
    #             if args.render:
    #                 env.render()
    #             ob_processed = preprocess(ob, env_name)
    #             #ob_processed = ob_processed[0] #get rid of first dimension ob.shape = (1,84,84,4)
    #             traj.append(ob_processed)
    #
    #             gt_rewards.append(r[0])
    #             steps += 1
    #             acc_reward += r[0]
    #             if done:
    #                 break
    #         print("checkpoint: {}, steps: {}, return: {}".format(model_path, steps,acc_reward))
    #
    #         print("traj length", len(traj))
    #         print("demo length", len(demonstrations))
    #         demonstrations.append(traj)
    #         learning_returns.append(acc_reward)
    #         learning_rewards.append(gt_rewards)

    #add no-op demos
    done = False
    traj = []
    gt_rewards = []
    r = 0

    ob = env.reset()
    steps = 0
    acc_reward = 0
    while steps < 3000:
        action = 0  # agent.act(ob, r, done)
        ob, r, done, _ = env.step(action)
        ob_processed = preprocess(ob, env_name)
        #ob_processed = ob_processed[0] #get rid of first dimension ob.shape = (1,84,84,4)
        traj.append(ob_processed)

        gt_rewards.append(r[0])
        steps += 1
        acc_reward += r[0]
        if done:
            print("checkpoint: {}, steps: {}, return: {}".format("noop", steps,acc_reward))
            break
    print("noop traj length", len(traj))
    print("demo length", len(demonstrations))
    demonstrations.append(traj)
    learning_returns.append(acc_reward)
    learning_rewards.append(gt_rewards)


    return demonstrations, learning_returns, learning_rewards