Example #1
# Module-level imports assumed by the function body; the rlkit import
# paths below may differ across rlkit versions.
import numpy as np
import torch
import torch.nn as nn

import rlkit.torch.pytorch_util as ptu
from rlkit.torch.networks import Mlp
from rlkit.samplers.data_collector import MdpPathCollector
from rlkit.torch.torch_rl_algorithm import TorchOnlineRLAlgorithm


def experiment(variant):
    from simple_sup_lstm import SimpleSupLSTMEnv
    expl_env = SimpleSupLSTMEnv(**variant['env_kwargs'])
    eval_env = SimpleSupLSTMEnv(**variant['env_kwargs'])
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    label_num = expl_env.label_num
    label_dim = expl_env.label_dim
    max_path_length = variant['trainer_kwargs']['max_path_length']

    if variant['load_kwargs']['load']:
        load_dir = variant['load_kwargs']['load_dir']
        load_data = torch.load(load_dir + '/params.pkl', map_location='cpu')
        policy = load_data['trainer/policy']
        vf = load_data['trainer/value_function']
    else:
        hidden_dim = variant['lstm_kwargs']['hidden_dim']
        num_lstm_layers = variant['lstm_kwargs']['num_layers']
        node_dim = variant['gnn_kwargs']['node_dim']
        node_num = expl_env.node_num

        # policy module
        input_node_dim = int(obs_dim / node_num + label_dim)
        # initial action and LSTM hidden/cell states (one row per graph node)
        a_0 = np.zeros(action_dim)
        h1_0 = np.zeros((node_num, hidden_dim * num_lstm_layers))
        c1_0 = np.zeros((node_num, hidden_dim * num_lstm_layers))
        h2_0 = np.zeros((node_num, hidden_dim * num_lstm_layers))
        c2_0 = np.zeros((node_num, hidden_dim * num_lstm_layers))
        latent_0 = (h1_0, c1_0, h2_0, c2_0)
        from lstm_net import LSTMNet
        lstm1_ego = LSTMNet(input_node_dim, action_dim, hidden_dim,
                            num_lstm_layers)
        lstm1_other = LSTMNet(input_node_dim, 0, hidden_dim, num_lstm_layers)
        lstm2_ego = LSTMNet(node_dim, 0, hidden_dim, num_lstm_layers)
        lstm2_other = LSTMNet(node_dim, 0, hidden_dim, num_lstm_layers)
        from graph_builder import TrafficGraphBuilder
        gb = TrafficGraphBuilder(
            input_dim=hidden_dim,
            node_num=node_num,
            ego_init=torch.tensor([0., 1.]),
            other_init=torch.tensor([1., 0.]),
        )
        from gnn_net import GNNNet
        gnn = GNNNet(
            pre_graph_builder=gb,
            node_dim=variant['gnn_kwargs']['node_dim'],
            conv_type=variant['gnn_kwargs']['conv_type'],
            num_conv_layers=variant['gnn_kwargs']['num_layers'],
            hidden_activation=variant['gnn_kwargs']['activation'],
        )
        from gnn_lstm2_net import GNNLSTM2Net
        gnnlstm_net = GNNLSTM2Net(node_num, gnn, lstm1_ego, lstm1_other,
                                  lstm2_ego, lstm2_other)
        from layers import FlattenLayer, SelectLayer
        # keep only the ego node (index 0), flatten, and map to action logits
        post_net = nn.Sequential(SelectLayer(-2, 0), FlattenLayer(2),
                                 nn.ReLU(), nn.Linear(hidden_dim, action_dim))
        from softmax_lstm_policy import SoftmaxLSTMPolicy
        policy = SoftmaxLSTMPolicy(
            a_0=a_0,
            latent_0=latent_0,
            obs_dim=obs_dim,
            action_dim=action_dim,
            lstm_net=gnnlstm_net,
            post_net=post_net,
        )

        # sup_learner module
        input_node_dim = int(obs_dim / node_num)
        a_0 = np.zeros(action_dim)
        h1_0 = np.zeros((node_num, hidden_dim * num_lstm_layers))
        c1_0 = np.zeros((node_num, hidden_dim * num_lstm_layers))
        h2_0 = np.zeros((node_num, hidden_dim * num_lstm_layers))
        c2_0 = np.zeros((node_num, hidden_dim * num_lstm_layers))
        latent_0 = (h1_0, c1_0, h2_0, c2_0)
        from lstm_net import LSTMNet
        lstm1_ego = LSTMNet(input_node_dim, action_dim, hidden_dim,
                            num_lstm_layers)
        lstm1_other = LSTMNet(input_node_dim, 0, hidden_dim, num_lstm_layers)
        lstm2_ego = LSTMNet(node_dim, 0, hidden_dim, num_lstm_layers)
        lstm2_other = LSTMNet(node_dim, 0, hidden_dim, num_lstm_layers)
        from graph_builder import TrafficGraphBuilder
        gb = TrafficGraphBuilder(
            input_dim=hidden_dim,
            node_num=node_num,
            ego_init=torch.tensor([0., 1.]),
            other_init=torch.tensor([1., 0.]),
        )
        from gnn_net import GNNNet
        gnn = GNNNet(
            pre_graph_builder=gb,
            node_dim=variant['gnn_kwargs']['node_dim'],
            conv_type=variant['gnn_kwargs']['conv_type'],
            num_conv_layers=variant['gnn_kwargs']['num_layers'],
            hidden_activation=variant['gnn_kwargs']['activation'],
        )
        from gnn_lstm2_net import GNNLSTM2Net
        gnnlstm_net = GNNLSTM2Net(node_num, gnn, lstm1_ego, lstm1_other,
                                  lstm2_ego, lstm2_other)
        from layers import FlattenLayer, SelectLayer
        # keep the non-ego nodes and map each one to label logits
        post_net = nn.Sequential(
            SelectLayer(-2, np.arange(1, node_num)),
            nn.ReLU(),
            nn.Linear(hidden_dim, label_dim),
        )
        from softmax_lstm_policy import SoftmaxLSTMPolicy
        sup_learner = SoftmaxLSTMPolicy(
            a_0=a_0,
            latent_0=latent_0,
            obs_dim=obs_dim,
            action_dim=action_dim,
            lstm_net=gnnlstm_net,
            post_net=post_net,
        )

        # combined policy: wrap the action policy and the supervised learner
        from sup_sep_softmax_lstm_policy import SupSepSoftmaxLSTMPolicy
        policy = SupSepSoftmaxLSTMPolicy(
            obs_dim=obs_dim,
            action_dim=action_dim,
            policy=policy,
            sup_learner=sup_learner,
            label_num=label_num,
            label_dim=label_dim,
        )
        print('parameters: ',
              np.sum([p.view(-1).shape[0] for p in policy.parameters()]))

        vf = Mlp(
            hidden_sizes=[32, 32],
            input_size=obs_dim,
            output_size=1,
        )

    vf_criterion = nn.MSELoss()
    from rlkit.torch.policies.make_deterministic import MakeDeterministic
    eval_policy = MakeDeterministic(policy)
    expl_policy = policy

    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    from sup_sep_rollout import sup_sep_rollout
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
        rollout_fn=sup_sep_rollout,
    )
    from sup_replay_buffer import SupReplayBuffer
    replay_buffer = SupReplayBuffer(
        observation_dim=obs_dim,
        action_dim=action_dim,
        label_dim=label_num,
        max_replay_buffer_size=int(1e6),
        max_path_length=max_path_length,
        recurrent=True,
    )

    from rlkit.torch.vpg.ppo_sup_sep import PPOSupSepTrainer
    trainer = PPOSupSepTrainer(policy=policy,
                               value_function=vf,
                               vf_criterion=vf_criterion,
                               replay_buffer=replay_buffer,
                               recurrent=True,
                               **variant['trainer_kwargs'])
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
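
Both examples read their configuration from a nested variant dict. The sketch
below shows the keys that Example #1 actually dereferences, filled with
hypothetical placeholder values (the dimensions, conv_type, activation, and
epoch count are assumptions, not values from the source):

# Hypothetical configuration; only the key structure comes from the code above.
variant = dict(
    env_kwargs=dict(),
    load_kwargs=dict(load=False, load_dir=None),
    lstm_kwargs=dict(hidden_dim=32, num_layers=1),
    gnn_kwargs=dict(node_dim=16, conv_type='GSage', num_layers=3,
                    activation='relu'),
    trainer_kwargs=dict(max_path_length=200),
    algorithm_kwargs=dict(num_epochs=100),
)
experiment(variant)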
Example #2
# Module-level imports assumed by the function body; the rlkit paths may
# differ across versions. `args` (parsed CLI arguments) and
# get_traffic_path_information are also module-level names in the
# original script.
import numpy as np
import torch
import torch.nn as nn

import rlkit.torch.pytorch_util as ptu
from rlkit.torch.networks import Mlp
from rlkit.samplers.data_collector import MdpPathCollector
from rlkit.torch.torch_rl_algorithm import TorchOnlineRLAlgorithm


def experiment(variant):
    from traffic.make_env import make_env
    expl_env = make_env(args.exp_name, **variant['env_kwargs'])
    eval_env = make_env(args.exp_name, **variant['env_kwargs'])
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    label_num = expl_env.label_num
    label_dim = expl_env.label_dim
    max_path_length = variant['trainer_kwargs']['max_path_length']

    if variant['load_kwargs']['load']:
        load_dir = variant['load_kwargs']['load_dir']
        load_data = torch.load(load_dir + '/params.pkl', map_location='cpu')
        policy = load_data['trainer/policy']
        vf = load_data['trainer/value_function']
    else:
        hidden_dim = variant['lstm_kwargs']['hidden_dim']
        num_layers = variant['lstm_kwargs']['num_layers']

        # policy module
        a_0 = np.zeros(action_dim)
        h_0 = np.zeros(hidden_dim * num_layers)
        c_0 = np.zeros(hidden_dim * num_layers)
        latent_0 = (h_0, c_0)
        from lstm_net import LSTMNet
        lstm_net = LSTMNet(int(obs_dim + (label_num + 1) * label_dim),
                           action_dim, hidden_dim, num_layers)
        post_net = torch.nn.Linear(hidden_dim, action_dim)
        from softmax_lstm_policy import SoftmaxLSTMPolicy
        policy = SoftmaxLSTMPolicy(
            a_0=a_0,
            latent_0=latent_0,
            obs_dim=obs_dim,
            action_dim=action_dim,
            lstm_net=lstm_net,
            post_net=post_net,
        )

        # sup_learner module
        a_0 = np.zeros(action_dim)
        h_0 = np.zeros(hidden_dim * num_layers)
        c_0 = np.zeros(hidden_dim * num_layers)
        latent_0 = (h_0, c_0)
        lstm_net = LSTMNet(obs_dim, action_dim, hidden_dim, num_layers)
        from layers import ReshapeLayer
        # map LSTM features to per-label logits of shape (label_num, label_dim)
        post_net = nn.Sequential(
            nn.Linear(hidden_dim, int(label_num * label_dim)),
            ReshapeLayer(shape=(label_num, label_dim)),
        )
        from softmax_lstm_policy import SoftmaxLSTMPolicy
        sup_learner = SoftmaxLSTMPolicy(
            a_0=a_0,
            latent_0=latent_0,
            obs_dim=obs_dim,
            action_dim=action_dim,
            lstm_net=lstm_net,
            post_net=post_net,
        )

        # combined policy: wrap the action policy and the supervised learner
        from sup_sep_softmax_lstm_policy import SupSepSoftmaxLSTMPolicy
        policy = SupSepSoftmaxLSTMPolicy(
            obs_dim=obs_dim,
            action_dim=action_dim,
            policy=policy,
            sup_learner=sup_learner,
            label_num=label_num,
            label_dim=label_dim,
        )
        print('parameters: ',
              np.sum([p.view(-1).shape[0] for p in policy.parameters()]))

        vf = Mlp(
            hidden_sizes=[32, 32],
            input_size=obs_dim,
            output_size=1,
        )

    vf_criterion = nn.MSELoss()
    from rlkit.torch.policies.make_deterministic import MakeDeterministic
    eval_policy = MakeDeterministic(policy)
    expl_policy = policy

    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    from sup_sep_rollout import sup_sep_rollout
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
        rollout_fn=sup_sep_rollout,
    )
    from sup_replay_buffer import SupReplayBuffer
    replay_buffer = SupReplayBuffer(
        observation_dim=obs_dim,
        action_dim=action_dim,
        label_dim=label_num,
        max_replay_buffer_size=int(1e6),
        max_path_length=max_path_length,
        recurrent=True,
    )

    from rlkit.torch.vpg.ppo_sup_sep import PPOSupSepTrainer
    trainer = PPOSupSepTrainer(policy=policy,
                               value_function=vf,
                               vf_criterion=vf_criterion,
                               replay_buffer=replay_buffer,
                               recurrent=True,
                               **variant['trainer_kwargs'])
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        log_path_function=get_traffic_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
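
The project-local helper layers (ReshapeLayer, SelectLayer, FlattenLayer) are
not shown on this page. As an illustration, here is a minimal sketch of what
ReshapeLayer plausibly does, inferred only from its usage above (unfolding the
last feature dimension into (label_num, label_dim)); this is an assumption,
not the project's actual implementation:

import torch.nn as nn

class ReshapeLayer(nn.Module):
    # Reshape the trailing feature dimension into `shape`, keeping all
    # leading (batch/time) dimensions intact. Inferred from usage; the
    # real layers.ReshapeLayer may differ.
    def __init__(self, shape):
        super().__init__()
        self.shape = shape

    def forward(self, x):
        return x.reshape(*x.shape[:-1], *self.shape)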