Exemplo n.º 1
0
                       seed,
                       wrapper_kwargs={
                           'clip_rewards': False,
                           'episode_life': False,
                       })

    env = VecFrameStack(env, 4)
    agent = PPO2Agent(env, env_type, stochastic)

    demonstrations, learning_returns, learning_rewards = generate_mean_map_noop_demos(
        env)

    # Now we load a pretrained network to form the state features \phi(s); the reward is then w^T \phi(s)
    print("loading policy", args.pretrained_network)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    reward_net = EmbeddingNet(args.encoding_dims)
    reward_net.load_state_dict(
        torch.load(args.pretrained_network, map_location=device))
    #reinitialize last layer
    num_features = reward_net.fc2.in_features

    print("reward is linear combination of ", num_features, "features")
    reward_net.fc2 = nn.Linear(
        num_features, 1,
        bias=False)  #last layer just outputs the scalar reward = w^T \phi(s)
    reward_net.to(device)
    #freeze all weights so there are no gradients (we'll manually update the last layer via proposals so no grads required)
    for param in reward_net.parameters():
        param.requires_grad = False

    #get num_demos by num_features + 1 (bias) numpy array with (un-discounted) feature counts from pretrained network
    demonstrations = [x for _, x in sorted(zip(learning_returns,demonstrations), key=lambda pair: pair[0])]

    sorted_returns = sorted(learning_returns)
    print(sorted_returns)
    print("lengths")
    print([len(d) for d in demonstrations])


    # Now we load a pretrained network to form the state features \phi(s); the reward is then w^T \phi(s)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    if args.trex:
        print("using TREX network from ICML")
        reward_net = Net()
    else:
        reward_net = EmbeddingNet(args.encoding_dims)
    reward_net.load_state_dict(torch.load(args.pretrained_network, map_location=device))
    #reinitialize last layer
    num_features = reward_net.fc2.in_features

    print("reward is linear combination of ", num_features, "features")
    reward_net.to(device)
    #freeze all weights so there are no gradients (we'll manually update the last layer via proposals so no grads required)
    for param in reward_net.parameters():
        param.requires_grad = False

    #get num_demos by num_features + 1 (bias) numpy array with (un-discounted) feature counts from pretrained network
    directories = args.pretrained_network.split("/") #split on directories to get the last past
    filename = directories[-1] #last element should be the name of the pretrained network
    fname = filename.split(".")[0] #get first part before the .params_...
    demo_cnts = generate_feature_counts(demonstrations, reward_net) #compute the fcounts
Exemplo n.º 3
0
#     class Net(nn.Module):
#         def __init__(self):
#             super().__init__()
#
#             self.conv1 = nn.Conv2d(4, 16, 7, stride=3)
#             self.conv2 = nn.Conv2d(16, 16, 5, stride=2)
#             self.conv3 = nn.Conv2d(16, 16, 3, stride=1)
#             self.conv4 = nn.Conv2d(16, 16, 3, stride=1)
#
#             # This is the width of the layer between the convolved framestack
#             # and the actual latent space. Scales with ENCODING_DIMS
#             intermediate_dimension = min(784, max(64, ENCODING_DIMS*2))
#
#             # Brings the convolved frame down to intermediate dimension just
#             # before being sent to latent space
#             self.fc1 = nn.Linear(784, intermediate_dimension)
#
#             # This brings from intermediate dimension to latent space. Named mu
#             # because in the full network it includes a var also, to sample for
#             # the autoencoder
#             self.fc_mu = nn.Linear(intermediate_dimension, ENCODING_DIMS)
#
#             # This is the actual T-REX layer; linear comb. from ENCODING_DIMS
#             self.fc2 = nn.Linear(ENCODING_DIMS, 1)

# Build a "stripped" state dict: start from the parameters of a freshly
# constructed EmbeddingNet and overwrite every entry whose key also appears in
# ``model``. Keys of ``model`` that EmbeddingNet does not define are dropped.
# NOTE(review): ``model`` and ``ENCODING_DIMS`` are defined outside this
# excerpt; ``model`` is presumably a loaded state dict -- confirm.
net = EmbeddingNet(ENCODING_DIMS)
sd = net.state_dict()
# Filter against ``sd`` itself rather than calling ``net.state_dict()`` again
# for every key of ``model``; the comprehension is fully evaluated before
# ``update`` runs, so the key set being tested is the same.
sd.update({k: v for k, v in model.items() if k in sd})

# Write the filtered parameters to the path given as the second CLI argument.
torch.save(sd, sys.argv[2])
                        type=int,
                        default=50000,
                        help="how long to run before truncating policy")

    args = parser.parse_args()
    env_name = args.env_name
    #set seeds
    seed = int(args.seed)
    torch.manual_seed(seed)
    np.random.seed(seed)
    tf.set_random_seed(seed)

    network_file_loc = args.pretrained_network
    print("Using network at", network_file_loc, "for features.")
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    feature_net = EmbeddingNet(args.encoding_dims)
    state_dict = torch.load(network_file_loc, map_location=device)
    print(state_dict.keys())
    feature_net.load_state_dict(
        torch.load(network_file_loc, map_location=device))
    feature_net.to(device)

    print("evaluating", args.checkpointpath)
    print("*" * 10)
    print(env_name)
    print("*" * 10)
    returns, ave_feature_counts, fcounts, num_steps = get_policy_feature_counts(
        env_name, args.checkpointpath, feature_net, args.num_rollouts,
        args.max_length, args.no_op)
    print("returns", returns)
    print("feature counts", ave_feature_counts)
Exemplo n.º 5
0
### This code will take in any pretrained network and compute the expected feature counts via Monte Carlo sampling according to the last
### layer of the pretrained network

import os
import sys
import pickle
import gym
import time
import numpy as np
import random
import torch
from run_test import *
#import matplotlib.pylab as plt
import argparse
from StrippedNet import EmbeddingNet
from baselines.common.trex_utils import preprocess
import utils

# Path of the pretrained network file whose parameters are loaded below.
network_file_loc = "/home/dsbrown/Code/deep-bayesian-irl/pretrained_networks/auxloss/breakout_64_all.params_stripped.params"

# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# NOTE(review): 64 is presumably the encoding dimension matching the
# checkpoint above -- confirm against EmbeddingNet's constructor.
feature_net = EmbeddingNet(64)
# Read the checkpoint from disk once and reuse it for both the diagnostic
# prints and the actual parameter load (it was previously loaded twice).
state_dict = torch.load(network_file_loc, map_location=device)
print(state_dict.keys())
print(state_dict['fc2.bias'])
feature_net.load_state_dict(state_dict)