Exemplo n.º 1
0
def experiment(variant):
    from simple_sup import SimpleSupEnv
    expl_env = SimpleSupEnv(**variant['env_kwars'])
    eval_env = SimpleSupEnv(**variant['env_kwars'])
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n

    hidden_dim = variant['hidden_dim']
    encoder = nn.Sequential(
             nn.Linear(obs_dim,hidden_dim),
             nn.ReLU(),
             nn.Linear(hidden_dim,hidden_dim),
             nn.ReLU(),
            )
    decoder = nn.Linear(hidden_dim, action_dim)
    from layers import ReshapeLayer
    sup_learner = nn.Sequential(
            nn.Linear(hidden_dim, action_dim),
            ReshapeLayer(shape=(1, action_dim)),
        )
    from sup_softmax_policy import SupSoftmaxPolicy
    policy = SupSoftmaxPolicy(encoder, decoder, sup_learner)
    print('parameters: ',np.sum([p.view(-1).shape[0] for p in policy.parameters()]))

    vf = Mlp(
        hidden_sizes=[32],
        input_size=obs_dim,
        output_size=1,
    )
    vf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(policy,use_preactivation=True)
    expl_policy = policy

    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )

    from rlkit.torch.vpg.ppo_sup_online import PPOSupOnlineTrainer
    trainer = PPOSupOnlineTrainer(
        policy=policy,
        value_function=vf,
        vf_criterion=vf_criterion,
        **variant['trainer_kwargs']
    )
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
Exemplo n.º 2
0
def experiment(variant):
    from traffic.make_env import make_env
    expl_env = make_env(args.exp_name, **variant['env_kwargs'])
    eval_env = make_env(args.exp_name, **variant['env_kwargs'])
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    label_num = expl_env.label_num
    label_dim = expl_env.label_dim

    encoder = nn.Sequential(
        nn.Linear(obs_dim, 32),
        nn.ReLU(),
        nn.Linear(32, 32),
        nn.ReLU(),
    )
    decoder = nn.Linear(32, action_dim)
    from layers import ReshapeLayer
    sup_learner = nn.Sequential(
        nn.Linear(32, int(label_num * label_dim)),
        ReshapeLayer(shape=(label_num, label_dim)),
    )
    from sup_softmax_policy import SupSoftmaxPolicy
    policy = SupSoftmaxPolicy(encoder, decoder, sup_learner)
    print('parameters: ',
          np.sum([p.view(-1).shape[0] for p in policy.parameters()]))

    vf = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=1,
    )
    vf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)
    expl_policy = policy

    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    trainer = TRPOTrainer(policy=policy,
                          value_function=vf,
                          vf_criterion=vf_criterion,
                          **variant['trainer_kwargs'])
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        log_path_function=get_traffic_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Exemplo n.º 3
0
def experiment(variant):
    from simple_sup import SimpleSupEnv
    expl_env = SimpleSupEnv(**variant['env_kwars'])
    eval_env = SimpleSupEnv(**variant['env_kwars'])
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n

    encoder = nn.Sequential(
        nn.Linear(obs_dim, 16),
        nn.ReLU(),
    )
    decoder = nn.Linear(16, action_dim)
    from layers import ReshapeLayer
    sup_learner = nn.Sequential(
        nn.Linear(16, action_dim),
        ReshapeLayer(shape=(1, action_dim)),
    )
    from sup_softmax_policy import SupSoftmaxPolicy
    policy = SupSoftmaxPolicy(encoder, decoder, sup_learner)

    vf = Mlp(
        hidden_sizes=[32],
        input_size=obs_dim,
        output_size=1,
    )
    vf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)
    expl_policy = policy

    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    from sup_replay_buffer import SupReplayBuffer
    replay_buffer = SupReplayBuffer(
        observation_dim=obs_dim,
        label_dim=1,
        max_replay_buffer_size=int(1e6),
    )

    from rlkit.torch.vpg.trpo_sup import TRPOSupTrainer
    trainer = TRPOSupTrainer(policy=policy,
                             value_function=vf,
                             vf_criterion=vf_criterion,
                             replay_buffer=replay_buffer,
                             **variant['trainer_kwargs'])
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Exemplo n.º 4
0
def experiment(variant):
    from traffic.make_env import make_env
    expl_env = make_env(args.exp_name, **variant['env_kwargs'])
    eval_env = make_env(args.exp_name, **variant['env_kwargs'])
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    label_num = expl_env.label_num
    label_dim = expl_env.label_dim

    if variant['load_kwargs']['load']:
        load_dir = variant['load_kwargs']['load_dir']
        load_data = torch.load(load_dir + '/params.pkl', map_location='cpu')
        policy = load_data['trainer/policy']
        vf = load_data['trainer/value_function']
    else:
        from graph_builder_multi import MultiTrafficGraphBuilder
        gb = MultiTrafficGraphBuilder(
            input_dim=4,
            node_num=expl_env.max_veh_num + 1,
            ego_init=torch.tensor([0., 1.]),
            other_init=torch.tensor([1., 0.]),
        )
        if variant['gnn_kwargs']['attention']:
            from gnn_attention_net import GNNAttentionNet
            gnn_class = GNNAttentionNet
        else:
            from gnn_net import GNNNet
            gnn_class = GNNNet
        gnn = gnn_class(
            pre_graph_builder=gb,
            node_dim=variant['gnn_kwargs']['node'],
            conv_type=variant['gnn_kwargs']['conv_type'],
            num_conv_layers=variant['gnn_kwargs']['layer'],
            hidden_activation=variant['gnn_kwargs']['activation'],
        )
        encoder = gnn
        from layers import FlattenLayer, SelectLayer
        decoder = nn.Sequential(
            SelectLayer(1, 0), FlattenLayer(), nn.ReLU(),
            nn.Linear(variant['gnn_kwargs']['node'], action_dim))
        from layers import ReshapeLayer
        sup_learner = nn.Sequential(
            SelectLayer(1, np.arange(1, expl_env.max_veh_num + 1)),
            nn.ReLU(),
            nn.Linear(variant['gnn_kwargs']['node'], label_dim),
        )
        from sup_softmax_policy import SupSoftmaxPolicy
        policy = SupSoftmaxPolicy(encoder, decoder, sup_learner)
        print('parameters: ',
              np.sum([p.view(-1).shape[0] for p in policy.parameters()]))

        vf = Mlp(
            hidden_sizes=[32, 32],
            input_size=obs_dim,
            output_size=1,
        )

    vf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)
    expl_policy = policy

    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )

    from sup_replay_buffer import SupReplayBuffer
    replay_buffer = SupReplayBuffer(
        observation_dim=obs_dim,
        label_dim=label_num,
        max_replay_buffer_size=int(1e6),
    )

    from rlkit.torch.vpg.ppo_sup_vanilla import PPOSupVanillaTrainer
    trainer = PPOSupVanillaTrainer(policy=policy,
                                   value_function=vf,
                                   vf_criterion=vf_criterion,
                                   replay_buffer=replay_buffer,
                                   **variant['trainer_kwargs'])
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        log_path_function=get_traffic_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Exemplo n.º 5
0
def experiment(variant):
    from traffic.make_env import make_env
    expl_env = make_env(args.exp_name, **variant['env_kwargs'])
    eval_env = make_env(args.exp_name, **variant['env_kwargs'])
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    label_num = expl_env.label_num
    label_dim = expl_env.label_dim

    if variant['load_kwargs']['load']:
        load_dir = variant['load_kwargs']['load_dir']
        load_data = torch.load(load_dir + '/params.pkl', map_location='cpu')
        policy = load_data['trainer/policy']
        vf = load_data['trainer/value_function']
    else:
        hidden_dim = variant['mlp_kwargs']['hidden']
        encoder = nn.Sequential(
            nn.Linear(obs_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
        )
        decoder = nn.Linear(hidden_dim, action_dim)
        from layers import ReshapeLayer
        sup_learner = nn.Sequential(
            nn.Linear(hidden_dim,
                      int(expl_env.label_num * expl_env.label_dim)),
            ReshapeLayer(shape=(expl_env.label_num, expl_env.label_dim)),
        )
        from sup_softmax_policy import SupSoftmaxPolicy
        policy = SupSoftmaxPolicy(encoder, decoder, sup_learner)
        print('parameters: ',
              np.sum([p.view(-1).shape[0] for p in policy.parameters()]))

        vf = Mlp(
            hidden_sizes=[32, 32],
            input_size=obs_dim,
            output_size=1,
        )

    vf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)
    expl_policy = policy

    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )

    from sup_replay_buffer import SupReplayBuffer
    replay_buffer = SupReplayBuffer(
        observation_dim=obs_dim,
        label_dim=label_num,
        max_replay_buffer_size=int(1e6),
    )

    from rlkit.torch.vpg.ppo_sup_vanilla import PPOSupVanillaTrainer
    trainer = PPOSupVanillaTrainer(policy=policy,
                                   value_function=vf,
                                   vf_criterion=vf_criterion,
                                   replay_buffer=replay_buffer,
                                   **variant['trainer_kwargs'])
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        log_path_function=get_traffic_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()