Example #1
    def __init__(
            self,
            hidden_sizes,
            obs_dim,
            action_dim,
            init_w=3e-3,
            hidden_activation=F.relu,
            output_activation=identity,
            hidden_init=ptu.fanin_init,
            b_init_value=0.1,
            layer_norm=False,
            layer_norm_kwargs=None,
    ):
        super().__init__()
        self.fc1 = Mlp(
            input_size=obs_dim + action_dim,
            hidden_sizes=[],
            output_size=hidden_sizes[0],
            output_activation=hidden_activation,
            layer_norm=layer_norm,
        )
        self.fc2 = Mlp(
            input_size=action_dim + hidden_sizes[0],
            hidden_sizes=hidden_sizes[1:],
            output_size=1,
            output_activation=output_activation,
            layer_norm=layer_norm,
        )
Example #2
def get_network(network_args, obs_dim, action_dim):
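    # Pick the Q-function architecture from network_args["type"]: convolutional
    # variants for VizDoom observations, otherwise a plain fully connected Mlp.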
    if network_args["type"] == "conv_mixed":
        from surprise.envs.vizdoom.networks import VizdoomQF
        qf = VizdoomQF(actions=action_dim, **network_args)
        target_qf = VizdoomQF(actions=action_dim, **network_args)
    elif network_args["type"] == "conv":
        from surprise.envs.vizdoom.networks import VizdoomFeaturizer
        print("Using conv")
        qf = VizdoomFeaturizer(dim=action_dim, **network_args)
        target_qf = VizdoomFeaturizer(dim=action_dim, **network_args)
    else:
        from rlkit.torch.networks import Mlp
        qf = Mlp(
            hidden_sizes=[128, 64, 32],
            input_size=obs_dim[0],
            output_size=action_dim,
        )
        target_qf = Mlp(
            hidden_sizes=[128, 64, 32],
            input_size=obs_dim[0],
            output_size=action_dim,
        )
    
    return (qf, target_qf)
Example #3
    def __init__(
        self,
        input_dim,
        output_dim,
        latent_dims,
        encode_mlp_kwargs,
        decode_mlp_kwargs,
        no_gradient,
    ):
        super(MLPAutoEncoder, self).__init__()

        self.no_gradient = no_gradient

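        # One encoder Mlp per latent block; the decoder consumes the concatenated latents.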
        self.encode_mlps = nn.ModuleList()
        for latent_dim in latent_dims:
            mlp = Mlp(
                input_size=input_dim,
                output_size=latent_dim,
                **encode_mlp_kwargs,
            )
            self.encode_mlps.append(mlp)

        self.decode_mlp = Mlp(
            input_size=np.sum(latent_dims),
            output_size=output_dim,
            **decode_mlp_kwargs,
        )
Example #4
    def __init__(self, env):
        self.env = env
        self.width = self.env.grid.width
        self.height = self.env.grid.height
        self.abstract_dim = 4
        self.state_dim = 2
        self.states = []
        self.state_to_idx = None

        self.encoder = Mlp((64, 64, 64),
                           output_size=self.abstract_dim,
                           input_size=self.state_dim,
                           output_activation=F.softmax,
                           layer_norm=False)

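        # Enumerate every empty grid cell as a reachable (i, j) state.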
        states = []
        for j in range(self.env.grid.height):
            for i in range(self.env.grid.width):
                if self.env.grid.get(i, j) is None:
                    states.append((i, j))

        self.states = states
        self.states_np = np.array(states)
        self.state_to_idx = {s: i for i, s in enumerate(self.states)}

        self.next_states = []
        for i, state in enumerate(states):
            next_states = self._gen_transitions(state)
            self.next_states.append(next_states)

        self.next_states = np.array(self.next_states)

        self.encoder.cuda()
        self.optimizer = optim.Adam(self.encoder.parameters(), lr=1e-4)
Example #5
    def __init__(self, env):
        self.env = env
        self.width = self.env.grid.width
        self.height = self.env.grid.height
        self.abstract_dim = 4
        self.state_dim = 2
        self.states = []
        self.state_to_idx = None

        self.encoder = Mlp((64, 64, 64),
                           output_size=self.abstract_dim,
                           input_size=self.state_dim,
                           output_activation=F.softmax,
                           layer_norm=True)

        states = []
        for j in range(self.env.grid.height):
            for i in range(self.env.grid.width):
                if self.env.grid.get(i, j) is None:
                    states.append((i, j))
        state_to_idx = {s: i for i, s in enumerate(states)}

        self.states = states
        self.state_to_idx = state_to_idx

        transitions = []
        for i, state in enumerate(states):
            next_states = self._gen_transitions(state)
            for ns in next_states:
                transitions.append(list(state) + list(ns))
        self.transitions = transitions

        self.optimizer = optim.Adam(self.encoder.parameters())
Example #6
def experiment(variant):
    expl_env = gym.make("CartPole-v0")
    eval_env = gym.make("CartPole-v0")
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.n

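    # Q-network and target Q-network: each maps an observation to one Q-value per action.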
    qf = Mlp(hidden_sizes=[32, 32], input_size=obs_dim, output_size=action_dim)
    target_qf = Mlp(hidden_sizes=[32, 32],
                    input_size=obs_dim,
                    output_size=action_dim)
    qf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(qf)
    expl_policy = PolicyWrappedWithExplorationStrategy(
        EpsilonGreedy(expl_env.action_space), eval_policy)
    eval_path_collector = MdpPathCollector(eval_env, eval_policy)
    expl_path_collector = MdpPathCollector(expl_env, expl_policy)
    trainer = DQNTrainer(qf=qf,
                         target_qf=target_qf,
                         qf_criterion=qf_criterion,
                         **variant["trainer_kwargs"])
    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], expl_env)
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"])
    algorithm.to(ptu.device)
    algorithm.train()
Example #7
    def __init__(self, envs):
        self.envs = [EnvContainer(env) for env in envs]

        self.n_envs = len(self.envs)
        self.n_abstract_mdps = 2
        self.abstract_dim = 4
        self.state_dim = 4
        self.states = []
        self.state_to_idx = None

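        # Build n_abstract_mdps encoders per environment; each encoder maps a raw state
        # to a softmax distribution over abstract_dim abstract states.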
        all_encoder_lst = nn.ModuleList()
        for i in range(self.n_envs):
            encoder_lst = nn.ModuleList()
            for j in range(self.n_abstract_mdps):
                encoder = Mlp((128, 128, 128),
                              output_size=self.abstract_dim,
                              input_size=self.state_dim,
                              output_activation=F.softmax,
                              layer_norm=True)
                encoder.apply(init_weights)
                encoder_lst.append(encoder)

            all_encoder_lst.append(encoder_lst)
        self.all_encoder_lst = all_encoder_lst

        self.optimizer = optim.Adam(self.all_encoder_lst.parameters(), lr=1e-4)
Example #8
    def __init__(self, trunk_params, split_heads_params):
        self.save_init_params(locals())
        super().__init__()

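        # Shared trunk feeding two separate heads: one for the mean, one for the log std.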
        trunk_params['output_activation'] = F.relu
        self.trunk = Mlp(**trunk_params)
        self.mean_mlp = Mlp(**split_heads_params)
        self.log_sig_mlp = Mlp(**split_heads_params)
Example #9
def experiment(variant):
    import sys
    from traffic.make_env import make_env
    expl_env = make_env(args.exp_name)
    eval_env = make_env(args.exp_name)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n

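    # Softmax policy over discrete actions plus twin Q-networks and their target copies
    # (discrete SAC with double Q-learning).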
    module = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=action_dim,
    )
    policy = SoftmaxPolicy(module, **variant['policy_kwargs'])
    qf1 = Mlp(input_size=obs_dim,
              output_size=action_dim,
              **variant['qf_kwargs'])
    target_qf1 = copy.deepcopy(qf1)
    qf2 = Mlp(input_size=obs_dim,
              output_size=action_dim,
              **variant['qf_kwargs'])
    target_qf2 = copy.deepcopy(qf2)

    eval_policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)
    expl_policy = policy

    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    qf_criterion = nn.MSELoss()
    trainer = SACDiscreteTrainer(env=eval_env,
                                 policy=policy,
                                 qf1=qf1,
                                 qf2=qf2,
                                 target_qf1=target_qf1,
                                 target_qf2=target_qf2,
                                 qf_criterion=qf_criterion,
                                 **variant['trainer_kwargs'])
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #10
    def initialize_dynamics_model(self):
        obs_dim = self._obs[self.observation_key].shape[1]
        self.dynamics_model = Mlp(
            hidden_sizes=[128, 128],
            output_size=obs_dim,
            input_size=obs_dim + self._action_dim,
        )
        self.dynamics_model.to(ptu.device)
        self.dynamics_optimizer = Adam(self.dynamics_model.parameters())
        self.dynamics_loss = MSELoss()
Example #11
    def __init__(self,
                 env,
                 network,
                 device=0,
                 obs_key=None,
                 hist_size=5000,
                 reward_func=None,
                 **kwargs):
        #         from surprise.envs.vizdoom.networks import VAEConv
        #         from surprise.envs.vizdoom.buffer import VAEBuffer
        from surprise.envs.vizdoom.buffer import SimpleBuffer
        from surprise.envs.vizdoom.networks import VizdoomFeaturizer
        from rlkit.torch.networks import Mlp
        from torch import optim
        from gym import spaces
        '''
        params
        ======
        env (gym.Env) : environment to wrap

        '''
        self.device = device
        self.env = env
        self._obs_key = obs_key
        self._reward_func = reward_func

        # Gym spaces
        self.action_space = env.action_space
        self.observation_space = env.observation_space

        #         RND stuff
        self._buffer = SimpleBuffer(device=self.device, size=hist_size)
        if kwargs["network_type"] == "flat":
            self.target_net = Mlp(
                hidden_sizes=[128, 64],
                input_size=self.observation_space.low.size,
                output_size=64,
            ).to(self.device)
            self.target_net.eval()
            self.pred_net = Mlp(
                hidden_sizes=[128, 64, 32],
                input_size=self.observation_space.low.size,
                output_size=64,
            ).to(self.device)
        else:
            self.target_net = VizdoomFeaturizer(kwargs["encoding_size"]).to(
                self.device)
            self.target_net.eval()
            self.pred_net = VizdoomFeaturizer(kwargs["encoding_size"]).to(
                self.device)
        self.optimizer = optim.Adam(self.pred_net.parameters(), lr=1e-4)
        self.network = self.pred_net
        self.step_freq = 16
        self.loss = torch.zeros(1)
Example #12
def experiment(variant):
    # Select a different success_function for different tasks.
    expl_env = GymCraftingEnv(state_obs=True,
                              few_obj=True,
                              success_function=eval_eatbread)
    eval_env = GymCraftingEnv(state_obs=True,
                              few_obj=True,
                              success_function=eval_eatbread)
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.n

    qf = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=action_dim,
    )
    target_qf = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=action_dim,
    )
    qf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(qf)
    expl_policy = PolicyWrappedWithExplorationStrategy(
        EpsilonGreedy(expl_env.action_space),
        eval_policy,
    )
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    trainer = DQNTrainer(qf=qf,
                         target_qf=target_qf,
                         qf_criterion=qf_criterion,
                         **variant['trainer_kwargs'])
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #13
def experiment(variant):
    """Run the experiment."""
    eval_env = gym.make('CartPole-v0')
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    # Collect data.
    print('Collecting data...')
    data = []
    while len(data) < variant['offline_data_size']:
        done = False
        s = eval_env.reset()
        while not done:
            a = np.random.randint(action_dim)
            n, r, done, _ = eval_env.step(a)
            one_hot_a = np.zeros(action_dim)
            one_hot_a[a] = 1
            data.append((s, one_hot_a, r, n, done))
            s = n
            if len(data) == variant['offline_data_size']:
                break

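    # Standard DQN components, trained purely offline from the randomly collected dataset.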
    qf = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=action_dim,
    )
    target_qf = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=action_dim,
    )
    qf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(qf)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    trainer = DQNTrainer(
        qf=qf,
        target_qf=target_qf,
        qf_criterion=qf_criterion,
        **variant['trainer_kwargs']
    )
    offline_data = OfflineDataStore(data=data,)
    algorithm = TorchOfflineRLAlgorithm(
        trainer=trainer,
        evaluation_env=eval_env,
        evaluation_data_collector=eval_path_collector,
        offline_data=offline_data,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
Example #14
def experiment(variant):
    args = getArgs()
    # expl_env = NormalizedBoxEnv(environment(args))

    expl_env = environment(args, 'dqn')
    eval_env = environment(args, 'dqn')
    # expl_env.render()
    obs_dim = expl_env.get_obsdim()
    action_dim = expl_env.action_space.n

    qf = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=action_dim,
    )
    target_qf = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=action_dim,
    )
    qf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(qf)
    expl_policy = PolicyWrappedWithExplorationStrategy(
        EpsilonGreedy(expl_env.action_space),
        eval_policy,
    )
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    trainer = DQNTrainer(qf=qf,
                         target_qf=target_qf,
                         qf_criterion=qf_criterion,
                         **variant['trainer_kwargs'])
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #15
    def __init__(self, envs):
        self.envs = [EnvContainer(env) for env in envs]

        self.n_abstract_mdps = 2
        self.abstract_dim = 4
        self.state_dim = 4
        self.states = []
        self.state_to_idx = None

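        # Shared state encoder plus a learnable transition matrix over abstract states.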
        self.encoder = Mlp((64, 64, 64), output_size=self.abstract_dim, input_size=self.state_dim,
                           output_activation=F.softmax, layer_norm=True)
        self.transitions = nn.Parameter(torch.zeros((self.abstract_dim, self.abstract_dim)))

        self.optimizer = optim.Adam(self.encoder.parameters())
Example #16
    def __init__(
            self,
            # params for the mlp that encodes each timestep
            timestep_enc_params,
            # params for the mlp that encodes the whole trajectory
            traj_enc_params):
        self.save_init_params(locals())
        super().__init__()

        timestep_enc_params['output_activation'] = F.relu
        self.timestep_mlp = Mlp(**timestep_enc_params)
        # the relu below that has been commented out seriously hurts performance
        # traj_enc_params['output_activation'] = F.relu
        self.traj_enc_mlp = Mlp(**traj_enc_params)
        self.output_size = self.traj_enc_mlp.output_size
Example #17
def experiment(variant):
    env_sampler = MazeSampler(variant['env_specs'])
    env, _ = env_sampler()

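    # Convolutional Q-function for image observations, otherwise a flat Mlp.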
    if variant['conv_input']:
        qf = ConvNet(kernel_sizes=variant['kernel_sizes'],
                     num_channels=variant['num_channels'],
                     strides=variant['strides'],
                     paddings=variant['paddings'],
                     hidden_sizes=variant['hidden_sizes'],
                     input_size=env.observation_space.shape,
                     output_size=env.action_space.n)
    else:
        qf = Mlp(
            hidden_sizes=[
                variant['net_size'] for _ in range(variant['num_layers'])
            ],
            input_size=int(np.prod(env.observation_space.shape)),
            output_size=env.action_space.n,
        )
    qf_criterion = nn.MSELoss()
    # Use this to switch to DoubleDQN
    # algorithm = DoubleDQN(
    print('WTF is going on!')
    print(env_sampler)
    algorithm = MetaDQN(env_sampler=env_sampler,
                        qf=qf,
                        qf_criterion=qf_criterion,
                        **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
Example #18
    def __init__(self, 
                pre_graph_builder, 
                node_dim,
                output_dim,
                post_mlp_kwargs,
                num_conv_layers=3,
                ):
        super(GNNNet, self).__init__()

        # graph builder
        self.pre_graph_builder = pre_graph_builder

        # convs
        self.node_input_dim = pre_graph_builder.output_dim
        self.node_dim = node_dim
        self.num_conv_layers = num_conv_layers
        self.convs = self.build_convs(self.node_input_dim, self.node_dim, self.num_conv_layers)

        # post qf
        self.output_dim = output_dim
        self.post_mlp_kwargs = post_mlp_kwargs
        self.post_mlp = Mlp(
                        input_size=self.node_dim,
                        output_size=self.output_dim,
                        **self.post_mlp_kwargs
                        )
Example #19
    def __init__(
            self,
            representation_size,
            input_size,
            hidden_sizes,
            init_w=1e-3,
            hidden_init=ptu.fanin_init,
            output_activation=identity,
            output_scale=1,
            layer_norm=False,
    ):
        super().__init__()
        self.representation_size = representation_size
        self.hidden_init = hidden_init
        self.output_activation = output_activation
        self.dist_mu = np.zeros(self.representation_size)
        self.dist_std = np.ones(self.representation_size)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        self.init_w = init_w
        hidden_sizes = list(hidden_sizes)
        self.encoder = TwoHeadMlp(hidden_sizes,
                                  representation_size,
                                  representation_size,
                                  input_size,
                                  layer_norm=layer_norm)
        hidden_sizes.reverse()
        self.decoder = Mlp(hidden_sizes,
                           input_size,
                           representation_size,
                           layer_norm=layer_norm,
                           output_activation=output_activation,
                           output_bias=None)
        self.output_scale = output_scale
Example #20
def get_non_linear_results(
    ob_space,
    encoder,
    latent_dim,
    batch_size=128,
    num_batches=10000,
) -> NonLinearResults:
    state_dim = ob_space.low.size

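    # Fit a small Mlp decoder to reconstruct states from the encoder's latents;
    # the encoder itself is not updated, so reconstruction error measures how much
    # state information the latent representation retains.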
    decoder = Mlp(
        hidden_sizes=[64, 64],
        output_size=state_dim,
        input_size=latent_dim,
    )
    decoder.to(ptu.device)
    optimizer = optim.Adam(decoder.parameters())

    initial_loss = last_10_percent_loss = 0
    for i in range(num_batches):
        states = get_batch(ob_space, batch_size)
        x = ptu.from_numpy(states)
        z = encoder(x)
        x_hat = decoder(z)

        loss = ((x - x_hat)**2).mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if i == 0:
            initial_loss = ptu.get_numpy(loss)
        if i == int(num_batches * 0.9):
            last_10_percent_loss = ptu.get_numpy(loss)

    eval_states = get_batch(ob_space, batch_size=2**15)
    x = ptu.from_numpy(eval_states)
    z = encoder(x)
    x_hat = decoder(z)
    reconstruction = ptu.get_numpy(x_hat)
    loss = ((eval_states - reconstruction)**2).mean()
    last_10_percent_contribution = (
        (last_10_percent_loss - loss) / (initial_loss - loss))
    del decoder, optimizer
    return NonLinearResults(
        loss=loss,
        initial_loss=initial_loss,
        last_10_percent_contribution=last_10_percent_contribution,
    )
Example #21
def experiment(variant):
    from simple_sup import SimpleSupEnv
    expl_env = SimpleSupEnv(**variant['env_kwars'])
    eval_env = SimpleSupEnv(**variant['env_kwars'])
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n

    hidden_dim = variant['hidden_dim']
    encoder = nn.Sequential(
        nn.Linear(obs_dim, hidden_dim),
        nn.ReLU(),
        nn.Linear(hidden_dim, hidden_dim),
        nn.ReLU(),
    )
    decoder = nn.Linear(hidden_dim, action_dim)
    from layers import ReshapeLayer
    sup_learner = nn.Sequential(
        nn.Linear(hidden_dim, action_dim),
        ReshapeLayer(shape=(1, action_dim)),
    )
    from sup_softmax_policy import SupSoftmaxPolicy
    policy = SupSoftmaxPolicy(encoder, decoder, sup_learner)
    print('parameters: ',
          np.sum([p.view(-1).shape[0] for p in policy.parameters()]))

    vf = Mlp(
        hidden_sizes=[32],
        input_size=obs_dim,
        output_size=1,
    )
    vf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)
    expl_policy = policy

    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )

    from rlkit.torch.vpg.ppo_sup_online import PPOSupOnlineTrainer
    trainer = PPOSupOnlineTrainer(
        policy=policy,
        value_function=vf,
        vf_criterion=vf_criterion,
        **variant['trainer_kwargs']
    )
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
Example #22
def experiment(variant):
    from simple_sup import SimpleSupEnv
    expl_env = SimpleSupEnv(**variant['env_kwars'])
    eval_env = SimpleSupEnv(**variant['env_kwars'])
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n

    encoder = nn.Sequential(
        nn.Linear(obs_dim, 16),
        nn.ReLU(),
    )
    decoder = nn.Linear(16, action_dim)
    from layers import ReshapeLayer
    sup_learner = nn.Sequential(
        nn.Linear(16, action_dim),
        ReshapeLayer(shape=(1, action_dim)),
    )
    from sup_softmax_policy import SupSoftmaxPolicy
    policy = SupSoftmaxPolicy(encoder, decoder, sup_learner)

    vf = Mlp(
        hidden_sizes=[32],
        input_size=obs_dim,
        output_size=1,
    )
    vf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)
    expl_policy = policy

    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    from sup_replay_buffer import SupReplayBuffer
    replay_buffer = SupReplayBuffer(
        observation_dim=obs_dim,
        label_dim=1,
        max_replay_buffer_size=int(1e6),
    )

    from rlkit.torch.vpg.trpo_sup import TRPOSupTrainer
    trainer = TRPOSupTrainer(policy=policy,
                             value_function=vf,
                             vf_criterion=vf_criterion,
                             replay_buffer=replay_buffer,
                             **variant['trainer_kwargs'])
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #23
def experiment(variant):
    from traffic.make_env import make_env
    expl_env = make_env(args.exp_name, **variant['env_kwargs'])
    eval_env = make_env(args.exp_name, **variant['env_kwargs'])
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    label_num = expl_env.label_num
    label_dim = expl_env.label_dim

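    # Policy with a shared encoder, an action head (decoder), and an auxiliary
    # supervised head that predicts label_num labels of dimension label_dim each.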
    encoder = nn.Sequential(
        nn.Linear(obs_dim, 32),
        nn.ReLU(),
        nn.Linear(32, 32),
        nn.ReLU(),
    )
    decoder = nn.Linear(32, action_dim)
    from layers import ReshapeLayer
    sup_learner = nn.Sequential(
        nn.Linear(32, int(label_num * label_dim)),
        ReshapeLayer(shape=(label_num, label_dim)),
    )
    from sup_softmax_policy import SupSoftmaxPolicy
    policy = SupSoftmaxPolicy(encoder, decoder, sup_learner)
    print('parameters: ',
          np.sum([p.view(-1).shape[0] for p in policy.parameters()]))

    vf = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=1,
    )
    vf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)
    expl_policy = policy

    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    trainer = TRPOTrainer(policy=policy,
                          value_function=vf,
                          vf_criterion=vf_criterion,
                          **variant['trainer_kwargs'])
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        log_path_function=get_traffic_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #24
def experiment(variant):
    env = DiscreteSwimmerEnv(**variant['env_params'])

    qf = Mlp(input_size=int(np.prod(env.observation_space.shape)),
             output_size=env.action_space.n,
             **variant['qf_kwargs'])
    algorithm = DQN(env, qf=qf, **variant['algo_params'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #25
def gen_network_num_obj(variant, action_dim, layer_size, policy=False):
    return FoodNetworkMediumPartialObsTaskNumObj(
        img_network=Mlp(**variant['full_img_network_kwargs']),
        inventory_network=FlattenMlp(**variant['inventory_network_kwargs']),
        num_obj_network=Mlp(**variant['num_obj_network_kwargs']),
        final_network=FlattenMlp(
            input_size=variant['full_img_network_kwargs']['output_size'] +
            variant['inventory_network_kwargs']['output_size'] +
            variant['num_obj_network_kwargs']['output_size'],
            output_size=action_dim,
            hidden_sizes=[layer_size, layer_size],
            output_activation=F.softmax if policy else identity),
        sizes=[
            variant['full_img_network_kwargs']['input_size'],
            # shelf dim
            64,
            # num made objs
            8
        ])
Example #26
    def __init__(self, enc_hidden_sizes, z_dim, classifier_hidden_sizes):
        super(Classifier, self).__init__()
        self.enc = Mlp(
            enc_hidden_sizes,
            z_dim,
            6,
            hidden_activation=torch.nn.functional.relu,
            # batch_norm=True
            # layer_norm=True
        )
        self.classifier = Mlp(
            classifier_hidden_sizes,
            1,
            z_dim + 6,
            hidden_activation=torch.nn.functional.relu,
            # batch_norm=True
            # layer_norm=True
        )
        self.z_dim = z_dim
Example #27
def experiment(variant):
    from traffic.make_env import make_env
    expl_env = make_env(args.exp_name, **variant['env_kwargs'])
    eval_env = make_env(args.exp_name, **variant['env_kwargs'])
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    label_num = expl_env.label_num
    label_dim = expl_env.label_dim

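    # Either resume the policy and value function from a saved checkpoint,
    # or build them from scratch.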
    if variant['load_kwargs']['load']:
        load_dir = variant['load_kwargs']['load_dir']
        load_data = torch.load(load_dir + '/params.pkl', map_location='cpu')
        policy = load_data['trainer/policy']
        vf = load_data['trainer/value_function']
    else:
        hidden_dim = variant['mlp_kwargs']['hidden']
        policy = nn.Sequential(nn.Linear(obs_dim, hidden_dim), nn.ReLU(),
                               nn.Linear(hidden_dim, hidden_dim), nn.ReLU(),
                               nn.Linear(hidden_dim, action_dim))
        policy = SoftmaxPolicy(policy)
        print('parameters: ',
              np.sum([p.view(-1).shape[0] for p in policy.parameters()]))

        vf = Mlp(
            hidden_sizes=[32, 32],
            input_size=obs_dim,
            output_size=1,
        )

    vf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)
    expl_policy = policy

    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    trainer = PPOTrainer(policy=policy,
                         value_function=vf,
                         vf_criterion=vf_criterion,
                         **variant['trainer_kwargs'])
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        log_path_function=get_traffic_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #28
def experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    env = DiscretizeEnv(env, variant['num_bins'])
    # env = DiscreteReacherEnv(**variant['env_kwargs'])

    qf = Mlp(input_size=int(np.prod(env.observation_space.shape)),
             output_size=env.action_space.n,
             **variant['qf_kwargs'])
    algorithm = FiniteHorizonDQN(env, qf, **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #29
    def __init__(
        self,
        K,
        representation_size,
        action_size,
    ):
        super().__init__()
        self.K = K
        self.rep_size = representation_size
        self.action_size = action_size

        self.effect_size = 16
        self.enc_rep_size = representation_size - self.effect_size
        self.interaction_size = 128

        #self.action_encoder = Mlp((128,), self.action_enc_size, action_size, hidden_activation=nn.ELU())

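        # Sub-networks: a latent encoder, an embedding network over concatenated latent
        # pairs, an effect network, a sigmoid attention head, and a final encoder that
        # outputs an effect_size vector.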
        self.lambda_encoder = Mlp((128, ),
                                  self.enc_rep_size,
                                  representation_size,
                                  hidden_activation=nn.ELU())
        self.embedding_network = Mlp((256, ),
                                     self.interaction_size,
                                     self.enc_rep_size * 2,
                                     hidden_activation=nn.ELU(),
                                     output_activation=nn.ELU())
        self.effect_network = Mlp((128, ),
                                  self.interaction_size,
                                  self.interaction_size,
                                  hidden_activation=nn.ELU(),
                                  output_activation=nn.ELU())
        self.attention_network = Mlp((128, ),
                                     1,
                                     self.interaction_size,
                                     hidden_activation=nn.ELU(),
                                     output_activation=nn.Sigmoid())
        self.encoder_network = Mlp((128, ),
                                   self.effect_size,
                                   self.interaction_size,
                                   hidden_activation=nn.ELU())
Example #30
def gen_network(variant, action_dim, layer_size, policy=False):
    return FlatFoodNetworkMedium(
        img_network=Mlp(**variant['img_network_kwargs']),
        full_img_network=Mlp(**variant['full_img_network_kwargs']),
        inventory_network=FlattenMlp(**variant['inventory_network_kwargs']),
        final_network=FlattenMlp(
            input_size=variant['img_network_kwargs']['output_size'] +
            variant['full_img_network_kwargs']['output_size'] +
            variant['inventory_network_kwargs']['output_size'],
            output_size=action_dim,
            hidden_sizes=[layer_size, layer_size],
            output_activation=F.softmax if policy else identity),
        sizes=[
            variant['img_network_kwargs']['input_size'],
            variant['full_img_network_kwargs']['input_size'],
            # health dim
            1,
            # pantry dim
            400,
            # shelf dim
            40
        ])