def experiment(variant):
    expl_env = gym.make('GoalGridworld-v0')
    eval_env = gym.make('GoalGridworld-v0')
    obs_dim = expl_env.observation_space.spaces['observation'].low.size
    goal_dim = expl_env.observation_space.spaces['desired_goal'].low.size
    action_dim = expl_env.action_space.n
    qf = FlattenMlp(input_size=obs_dim + goal_dim, output_size=action_dim, hidden_sizes=[400, 300])
    target_qf = FlattenMlp(input_size=obs_dim + goal_dim, output_size=action_dim, hidden_sizes=[400, 300])
    eval_policy = ArgmaxDiscretePolicy(qf)
    exploration_strategy = EpsilonGreedy(action_space=expl_env.action_space)
    expl_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=exploration_strategy,
        policy=eval_policy,
    )
    replay_buffer = ObsDictRelabelingBuffer(env=eval_env, **variant['replay_buffer_kwargs'])
    observation_key = 'observation'
    desired_goal_key = 'desired_goal'
    eval_path_collector = GoalConditionedPathCollector(eval_env, eval_policy, observation_key=observation_key, desired_goal_key=desired_goal_key)
    expl_path_collector = GoalConditionedPathCollector(expl_env, expl_policy, observation_key=observation_key, desired_goal_key=desired_goal_key)
    trainer = DQNTrainer(qf=qf, target_qf=target_qf, **variant['trainer_kwargs'])
    trainer = HERTrainer(trainer)
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
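A minimal sketch (not from the original source) of the HER-specific part of the `variant` this goal-conditioned script consumes; the relabeling fractions below follow rlkit's HER examples, and the exact keys accepted by ObsDictRelabelingBuffer may differ across rlkit versions.

# Hedged example config; values are illustrative, not taken from the original script.
variant = dict(
    replay_buffer_kwargs=dict(
        max_size=int(1e6),
        fraction_goals_rollout_goals=0.2,  # remaining goals are relabeled from the rollout
        fraction_goals_env_goals=0.0,
    ),
    trainer_kwargs=dict(discount=0.99, learning_rate=3e-4),
    algo_kwargs=dict(batch_size=128, num_epochs=100, max_path_length=50),
)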
def experiment(variant):
    from simple_sup import SimpleSupEnv
    expl_env = SimpleSupEnv(**variant['env_kwars'])
    eval_env = SimpleSupEnv(**variant['env_kwars'])
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    encoder = nn.Sequential(nn.Linear(obs_dim, 16), nn.ReLU())
    decoder = nn.Linear(16, action_dim)
    from layers import ReshapeLayer
    sup_learner = nn.Sequential(decoder, ReshapeLayer(shape=(1, action_dim)))
    from sup_softmax_policy import SupSoftmaxPolicy
    policy = SupSoftmaxPolicy(encoder, decoder, sup_learner)
    vf = Mlp(hidden_sizes=[], input_size=obs_dim, output_size=1)
    vf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)
    expl_policy = policy
    eval_path_collector = MdpPathCollector(eval_env, eval_policy)
    expl_path_collector = MdpPathCollector(expl_env, expl_policy)
    from sup_online import SupOnlineTrainer
    trainer = SupOnlineTrainer(policy=policy, value_function=vf, vf_criterion=vf_criterion, **variant['trainer_kwargs'])
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    # Note: expl_env/eval_env, obs_dim, channels, action_dim, symbolic_action_space,
    # symb_env and hierarchical_rollout are defined elsewhere in the original script.
    qf = CNN(input_width=obs_dim, input_height=obs_dim, input_channels=channels, output_size=action_dim,
             kernel_sizes=[8, 4], n_channels=[16, 32], strides=[4, 2], paddings=[0, 0], hidden_sizes=[256])
    target_qf = CNN(input_width=obs_dim, input_height=obs_dim, input_channels=channels, output_size=action_dim,
                    kernel_sizes=[8, 4], n_channels=[16, 32], strides=[4, 2], paddings=[0, 0], hidden_sizes=[256])
    qf_criterion = nn.MSELoss()
    eval_learner_policy = ArgmaxDiscretePolicy(qf)
    expl_learner_policy = PolicyWrappedWithExplorationStrategy(
        AnnealedEpsilonGreedy(symbolic_action_space, anneal_rate=variant["anneal_rate"]),
        eval_learner_policy,
    )
    eval_policy = LearnPlanPolicy(eval_learner_policy)
    expl_policy = LearnPlanPolicy(expl_learner_policy)
    eval_path_collector = MdpPathCollector(eval_env, eval_policy, rollout=hierarchical_rollout)
    expl_path_collector = MdpPathCollector(expl_env, expl_policy, rollout=hierarchical_rollout)
    trainer = DQNTrainer(qf=qf, target_qf=target_qf, qf_criterion=qf_criterion, **variant["trainer_kwargs"])
    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], symb_env)
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    from rlkit.envs.gym_minigrid.gym_minigrid import envs
    expl_env = ToolsEnv(**variant['env_kwargs'])
    eval_env = ToolsEnv(**variant['env_kwargs'])
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    layer_size = variant['algo_kwargs']['layer_size']
    lifetime = variant['env_kwargs'].get('time_horizon', 0) == 0
    if lifetime:
        assert eval_env.time_horizon == 0, 'cannot have time horizon for lifetime env'
    qf = gen_network(variant['algo_kwargs'], action_dim, layer_size)
    target_qf = gen_network(variant['algo_kwargs'], action_dim, layer_size)
    qf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(qf)
    # eval_policy = SoftmaxQPolicy(qf)
    expl_policy = PolicyWrappedWithExplorationStrategy(
        EpsilonGreedyDecay(expl_env.action_space, 1e-4, 1, 0.1),
        eval_policy,
    )
    if lifetime:
        eval_policy = expl_policy
    # expl_policy = PolicyWrappedWithExplorationStrategy(
    #     EpsilonGreedy(expl_env.action_space, 0.5),
    #     eval_policy,
    # )
    if eval_env.time_horizon == 0:
        collector_class = LifetimeMdpPathCollector if lifetime else MdpPathCollector
    else:
        collector_class = MdpPathCollectorConfig
    eval_path_collector = collector_class(
        eval_env,
        eval_policy,
        # render=True
    )
    expl_path_collector = collector_class(expl_env, expl_policy)
    trainer = DoubleDQNTrainer(qf=qf, target_qf=target_qf, qf_criterion=qf_criterion, **variant['algo_kwargs']['trainer_kwargs'])
    replay_buffer = EnvReplayBuffer(variant['algo_kwargs']['replay_buffer_size'], expl_env)
    algo_class = TorchLifetimeRLAlgorithm if lifetime else TorchBatchRLAlgorithm
    algorithm = algo_class(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs']['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    # Select a different success_function for different tasks.
    expl_env = GymCraftingEnv(state_obs=True, few_obj=True, success_function=eval_eatbread)
    eval_env = GymCraftingEnv(state_obs=True, few_obj=True, success_function=eval_eatbread)
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    qf = Mlp(hidden_sizes=[32, 32], input_size=obs_dim, output_size=action_dim)
    target_qf = Mlp(hidden_sizes=[32, 32], input_size=obs_dim, output_size=action_dim)
    qf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(qf)
    expl_policy = PolicyWrappedWithExplorationStrategy(EpsilonGreedy(expl_env.action_space), eval_policy)
    eval_path_collector = MdpPathCollector(eval_env, eval_policy)
    expl_path_collector = MdpPathCollector(expl_env, expl_policy)
    trainer = DQNTrainer(qf=qf, target_qf=target_qf, qf_criterion=qf_criterion, **variant['trainer_kwargs'])
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    num_agent = variant['num_agent']
    from cartpole import CartPoleEnv
    expl_env = CartPoleEnv(mode=4)
    eval_env = CartPoleEnv(mode=4)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    qf_n, policy_n, target_qf_n, target_policy_n, eval_policy_n, expl_policy_n = [], [], [], [], [], []
    for i in range(num_agent):
        qf = FlattenMlp(input_size=(obs_dim * num_agent + action_dim * num_agent), output_size=1, **variant['qf_kwargs'])
        policy = GumbelSoftmaxMlpPolicy(input_size=obs_dim, output_size=action_dim, **variant['policy_kwargs'])
        target_qf = copy.deepcopy(qf)
        target_policy = copy.deepcopy(policy)
        eval_policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)
        expl_policy = PolicyWrappedWithExplorationStrategy(EpsilonGreedy(expl_env.action_space), eval_policy)
        qf_n.append(qf)
        policy_n.append(policy)
        target_qf_n.append(target_qf)
        target_policy_n.append(target_policy)
        eval_policy_n.append(eval_policy)
        expl_policy_n.append(expl_policy)
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent)
    trainer = MADDPGTrainer(qf_n=qf_n, target_qf_n=target_qf_n, policy_n=policy_n, target_policy_n=target_policy_n, **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant): """Run the experiment.""" eval_env = gym.make('CartPole-v0') obs_dim = eval_env.observation_space.low.size action_dim = eval_env.action_space.n # Collect data. print('Collecting data...') data = [] while len(data) < variant['offline_data_size']: done = False s = eval_env.reset() while not done: a = np.random.randint(action_dim) n, r, done, _ = eval_env.step(a) one_hot_a = np.zeros(action_dim) one_hot_a[a] = 1 data.append((s, one_hot_a, r, n, done)) s = n if len(data) == variant['offline_data_size']: break qf = Mlp( hidden_sizes=[32, 32], input_size=obs_dim, output_size=action_dim, ) target_qf = Mlp( hidden_sizes=[32, 32], input_size=obs_dim, output_size=action_dim, ) qf_criterion = nn.MSELoss() eval_policy = ArgmaxDiscretePolicy(qf) eval_path_collector = MdpPathCollector( eval_env, eval_policy, ) trainer = DQNTrainer( qf=qf, target_qf=target_qf, qf_criterion=qf_criterion, **variant['trainer_kwargs'] ) offline_data = OfflineDataStore(data=data,) algorithm = TorchOfflineRLAlgorithm( trainer=trainer, evaluation_env=eval_env, evaluation_data_collector=eval_path_collector, offline_data=offline_data, **variant['algorithm_kwargs'] ) algorithm.to(ptu.device) algorithm.train()
def experiment(variant):
    from cartpole import CartPoleEnv
    expl_env = CartPoleEnv(mode=2)
    eval_env = CartPoleEnv(mode=2)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    policy = SoftmaxMlpPolicy(input_size=obs_dim, output_size=action_dim, **variant['policy_kwargs'])
    qf1 = Mlp(input_size=obs_dim, output_size=action_dim, **variant['qf_kwargs'])
    target_qf1 = copy.deepcopy(qf1)
    qf2 = Mlp(input_size=obs_dim, output_size=action_dim, **variant['qf_kwargs'])
    target_qf2 = copy.deepcopy(qf2)
    eval_policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)
    expl_policy = policy
    eval_path_collector = MdpPathCollector(eval_env, eval_policy)
    expl_path_collector = MdpPathCollector(expl_env, expl_policy)
    qf_criterion = nn.MSELoss()
    trainer = SACDiscreteTrainer(env=eval_env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1,
                                 target_qf2=target_qf2, qf_criterion=qf_criterion, **variant['trainer_kwargs'])
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    import sys
    from traffic.make_env import make_env
    expl_env = make_env(args.exp_name)
    eval_env = make_env(args.exp_name)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    gb = TrafficGraphBuilder(input_dim=4, ego_init=torch.tensor([0., 1.]), other_init=torch.tensor([1., 0.]),
                             edge_index=torch.tensor([[0, 0, 1, 2], [1, 2, 0, 0]]))
    qf = GNNNet(pre_graph_builder=gb, node_dim=16, output_dim=action_dim, post_mlp_kwargs=variant['qf_kwargs'], num_conv_layers=3)
    target_qf = copy.deepcopy(qf)
    eval_policy = ArgmaxDiscretePolicy(qf)
    expl_policy = PolicyWrappedWithExplorationStrategy(EpsilonGreedy(expl_env.action_space, variant['epsilon']), eval_policy)
    eval_path_collector = MdpPathCollector(eval_env, eval_policy)
    expl_path_collector = MdpPathCollector(expl_env, expl_policy)
    qf_criterion = nn.MSELoss()
    trainer = DoubleDQNTrainer(qf=qf, target_qf=target_qf, qf_criterion=qf_criterion, **variant['trainer_kwargs'])
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    args = getArgs()
    # expl_env = NormalizedBoxEnv(environment(args))
    expl_env = environment(args, 'dqn')
    eval_env = environment(args, 'dqn')
    # expl_env.render()
    obs_dim = expl_env.get_obsdim()
    action_dim = expl_env.action_space.n
    qf = Mlp(hidden_sizes=[32, 32], input_size=obs_dim, output_size=action_dim)
    target_qf = Mlp(hidden_sizes=[32, 32], input_size=obs_dim, output_size=action_dim)
    qf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(qf)
    expl_policy = PolicyWrappedWithExplorationStrategy(EpsilonGreedy(expl_env.action_space), eval_policy)
    eval_path_collector = MdpPathCollector(eval_env, eval_policy)
    expl_path_collector = MdpPathCollector(expl_env, expl_policy)
    trainer = DQNTrainer(qf=qf, target_qf=target_qf, qf_criterion=qf_criterion, **variant['trainer_kwargs'])
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def __init__(self, env_sampler, qf, policy=None, learning_rate=1e-3, use_hard_updates=False,
             hard_update_period=1000, tau=0.001, epsilon=0.1, qf_criterion=None, **kwargs):
    """
    :param env: Env.
    :param qf: QFunction. Maps from state to action Q-values.
    :param learning_rate: Learning rate for qf. Adam is used.
    :param use_hard_updates: Use a hard rather than soft update.
    :param hard_update_period: How many gradient steps before copying the parameters over.
        Used if `use_hard_updates` is True.
    :param tau: Soft target tau to update target QF. Used if `use_hard_updates` is False.
    :param epsilon: Probability of taking a random action.
    :param kwargs: kwargs to pass onto TorchRLAlgorithm
    """
    self.env_sampler = env_sampler
    env, _ = env_sampler()
    exploration_strategy = EpsilonGreedy(action_space=env.action_space, prob_random_action=epsilon)
    self.policy = policy or ArgmaxDiscretePolicy(qf)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=exploration_strategy,
        policy=self.policy,
    )
    super().__init__(env_sampler, exploration_policy, eval_policy=self.policy, **kwargs)
    self.qf = qf
    self.target_qf = self.qf.copy()
    self.learning_rate = learning_rate
    self.use_hard_updates = use_hard_updates
    self.hard_update_period = hard_update_period
    self.tau = tau
    self.qf_optimizer = optim.Adam(self.qf.parameters(), lr=self.learning_rate)
    self.qf_criterion = qf_criterion or nn.MSELoss()
    self.eval_statistics = None
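A minimal usage sketch for the constructor above, not taken from the original source: the class name `EnvSamplerDQN` is hypothetical, and the only assumptions are the ones the docstring states, namely that `env_sampler` returns an `(env, extra_info)` pair and that `qf` maps states to per-action Q-values.

# Hedged sketch; `EnvSamplerDQN` stands in for whatever class owns this __init__.
def env_sampler():
    env = gym.make('CartPole-v0')
    return env, None

env, _ = env_sampler()
qf = Mlp(hidden_sizes=[32, 32], input_size=env.observation_space.low.size, output_size=env.action_space.n)
algo = EnvSamplerDQN(env_sampler=env_sampler, qf=qf, learning_rate=1e-3,
                     use_hard_updates=True, hard_update_period=1000, epsilon=0.1)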
def experiment(variant):
    import sys
    from traffic.make_env import make_env
    expl_env = make_env(args.exp_name)
    eval_env = make_env(args.exp_name)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    gb = TrafficGraphBuilder(input_dim=4, ego_init=torch.tensor([0., 1.]), other_init=torch.tensor([1., 0.]),
                             edge_index=torch.tensor([[0, 0, 1, 2], [1, 2, 0, 0]]))
    module = GNNNet(pre_graph_builder=gb, node_dim=16, output_dim=action_dim, post_mlp_kwargs=dict(hidden_sizes=[32]), num_conv_layers=3)
    policy = SoftmaxPolicy(module, **variant['policy_kwargs'])
    vf = Mlp(hidden_sizes=[32, 32], input_size=obs_dim, output_size=1)
    vf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)
    expl_policy = policy
    eval_path_collector = MdpPathCollector(eval_env, eval_policy)
    expl_path_collector = MdpPathCollector(expl_env, expl_policy)
    trainer = PPOTrainer(policy=policy, value_function=vf, vf_criterion=vf_criterion, **variant['trainer_kwargs'])
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    import sys
    from traffic.make_env import make_env
    expl_env = make_env(args.exp_name)
    eval_env = make_env(args.exp_name)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    qf = Mlp(input_size=obs_dim, output_size=action_dim, **variant['qf_kwargs'])
    target_qf = copy.deepcopy(qf)
    eval_policy = ArgmaxDiscretePolicy(qf)
    expl_policy = PolicyWrappedWithExplorationStrategy(EpsilonGreedy(expl_env.action_space, variant['epsilon']), eval_policy)
    eval_path_collector = MdpPathCollector(eval_env, eval_policy)
    expl_path_collector = MdpPathCollector(expl_env, expl_policy)
    qf_criterion = nn.MSELoss()
    trainer = DoubleDQNTrainer(qf=qf, target_qf=target_qf, qf_criterion=qf_criterion, **variant['trainer_kwargs'])
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_traffic_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    expl_env = gym.make('CartPole-v0')
    eval_env = gym.make('CartPole-v0')
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    qf = Mlp(hidden_sizes=[32, 32], input_size=obs_dim, output_size=action_dim)
    target_qf = Mlp(hidden_sizes=[32, 32], input_size=obs_dim, output_size=action_dim)
    qf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(qf)
    expl_policy = PolicyWrappedWithExplorationStrategy(EpsilonGreedy(expl_env.action_space), eval_policy)
    eval_path_collector = MdpPathCollector(eval_env, eval_policy)
    expl_path_collector = MdpPathCollector(expl_env, expl_policy)
    trainer = DQNTrainer(qf=qf, target_qf=target_qf, qf_criterion=qf_criterion, **variant['trainer_kwargs'])
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
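A hedged sketch (not from the original source) of the `variant` dict this minimal CartPole DQN function expects; the values mirror rlkit's DQN example defaults, and the exact keys accepted by `DQNTrainer` and `TorchBatchRLAlgorithm` depend on the rlkit version in use.

# Illustrative config only; tune values for the actual task.
variant = dict(
    trainer_kwargs=dict(discount=0.99, learning_rate=3e-4),
    replay_buffer_size=int(1e6),
    algorithm_kwargs=dict(
        num_epochs=100,
        num_eval_steps_per_epoch=1000,
        num_trains_per_train_loop=1000,
        num_expl_steps_per_train_loop=1000,
        min_num_steps_before_training=1000,
        max_path_length=200,
        batch_size=128,
    ),
)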
def experiment(variant):
    from cartpole import CartPoleEnv
    expl_env = CartPoleEnv(mode=2)
    eval_env = CartPoleEnv(mode=2)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    # import gym
    # expl_env = gym.make('CartPole-v0')
    # eval_env = gym.make('CartPole-v0')
    # obs_dim = eval_env.observation_space.low.size
    # action_dim = eval_env.action_space.n
    policy = SoftmaxMlpPolicy(input_size=obs_dim, output_size=action_dim, **variant['policy_kwargs'])
    vf = Mlp(hidden_sizes=[32, 32], input_size=obs_dim, output_size=1)
    vf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)
    expl_policy = policy
    eval_path_collector = MdpPathCollector(eval_env, eval_policy)
    expl_path_collector = MdpPathCollector(expl_env, expl_policy)
    trainer = VPGTrainer(policy=policy, value_function=vf, vf_criterion=vf_criterion, **variant['trainer_kwargs'])
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    from cartpole import CartPoleEnv
    expl_env = CartPoleEnv(mode=2)
    eval_env = CartPoleEnv(mode=2)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    qf = Mlp(input_size=obs_dim, output_size=action_dim, **variant['qf_kwargs'])
    target_qf = copy.deepcopy(qf)
    eval_policy = ArgmaxDiscretePolicy(qf)
    expl_policy = PolicyWrappedWithExplorationStrategy(EpsilonGreedy(expl_env.action_space, variant['epsilon']), eval_policy)
    eval_path_collector = MdpPathCollector(eval_env, eval_policy)
    expl_path_collector = MdpPathCollector(expl_env, expl_policy)
    replay_buffer = PrioritizedReplayBuffer(variant['replay_buffer_size'], expl_env)
    qf_criterion = nn.MSELoss()
    trainer = DQNTrainer(qf=qf, target_qf=target_qf, qf_criterion=qf_criterion, replay_buffer=replay_buffer, **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant): import sys sys.path.append("./multiagent-particle-envs") from make_env import make_env from particle_env_wrapper import ParticleEnv expl_env = ParticleEnv(make_env(args.exp_name,discrete_action_space=True,discrete_action_input=True)) eval_env = ParticleEnv(make_env(args.exp_name,discrete_action_space=True,discrete_action_input=True)) num_agent = expl_env.num_agent obs_dim = eval_env.observation_space.low.size action_dim = eval_env.action_space.n policy_n, qf1_n, target_qf1_n, qf2_n, target_qf2_n, eval_policy_n, expl_policy_n = \ [], [], [], [], [], [], [] for i in range(num_agent): policy = SoftmaxMlpPolicy( input_size=obs_dim, output_size=action_dim, **variant['policy_kwargs'] ) qf1 = FlattenMlp( input_size=(obs_dim*num_agent+action_dim*(num_agent-1)), output_size=action_dim, **variant['qf_kwargs'] ) target_qf1 = copy.deepcopy(qf1) qf2 = FlattenMlp( input_size=(obs_dim*num_agent+action_dim*(num_agent-1)), output_size=action_dim, **variant['qf_kwargs'] ) target_qf2 = copy.deepcopy(qf2) eval_policy = ArgmaxDiscretePolicy(policy) expl_policy = PolicyWrappedWithExplorationStrategy( EpsilonGreedy(expl_env.action_space), eval_policy, ) policy_n.append(policy) qf1_n.append(qf1) target_qf1_n.append(target_qf1) qf2_n.append(qf2) target_qf2_n.append(target_qf2) eval_policy_n.append(eval_policy) expl_policy_n.append(expl_policy) eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n) expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n) replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent) trainer = MASACDiscreteTrainer( env = expl_env, qf1_n=qf1_n, target_qf1_n=target_qf1_n, qf2_n=qf2_n, target_qf2_n=target_qf2_n, policy_n=policy_n, **variant['trainer_kwargs'] ) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, log_path_function=get_generic_ma_path_information, **variant['algorithm_kwargs'] ) algorithm.to(ptu.device) algorithm.train()
args = parser.parse_args()
pre_dir = './Data/' + args.exp_name + args.extra_name
import os
data_path = '{}/{}/seed{}_load/{}.pkl'.format(pre_dir, args.log_dir, args.seed, args.file)
if os.path.exists(data_path):
    print('_load')
else:
    data_path = '{}/{}/seed{}/{}.pkl'.format(pre_dir, args.log_dir, args.seed, args.file)
data = torch.load(data_path, map_location='cpu')
if 'trainer/qf' in data.keys():
    qf = data['trainer/qf']
    eval_policy = ArgmaxDiscretePolicy(qf)
else:
    policy = data['trainer/policy']
    if isinstance(policy, SoftmaxPolicy) \
            or isinstance(policy, SupSoftmaxPolicy) \
            or isinstance(policy, SupSepSoftmaxPolicy):
        eval_policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)
    elif isinstance(policy, TanhGaussianPolicy):
        eval_policy = MakeDeterministic(policy)
if 'trainer/sup_learner' in data.keys():
    sup_learner = data['trainer/sup_learner']
else:
    sup_learner = None
import sys
def experiment(variant): setup_logger("name-of-experiment", variant=variant) ptu.set_gpu_mode(True) expl_env = gym.make(variant["env_name"]) eval_env = gym.make(variant["env_name"]) # OLD - Taxi image env # if isinstance(expl_env.observation_space, Json): # expl_env = BoxWrapper(expl_env) # eval_env = BoxWrapper(eval_env) # # obs_shape = expl_env.observation_space.image.shape # obs_shape = expl_env.observation_space.shape # if len(obs_shape) == 3 and obs_shape[2] in [1, 3]: # convert WxHxC into CxWxH # expl_env = TransposeImage(expl_env, op=[2, 0, 1]) # eval_env = TransposeImage(eval_env, op=[2, 0, 1]) # obs_shape = expl_env.observation_space.shape # channels, obs_width, obs_height = obs_shape # action_dim = eval_env.action_space.n # qf = CNN( # input_width=obs_width, # input_height=obs_height, # input_channels=channels, # output_size=action_dim, # kernel_sizes=[8, 4], # n_channels=[16, 32], # strides=[4, 2], # paddings=[0, 0], # hidden_sizes=[256], # ) # target_qf = CNN( # input_width=obs_width, # input_height=obs_height, # input_channels=channels, # output_size=action_dim, # kernel_sizes=[8, 4], # n_channels=[16, 32], # strides=[4, 2], # paddings=[0, 0], # hidden_sizes=[256], # ) ( obs_shape, obs_space, action_space, n, mlp, channels, fc_input, ) = common.get_spaces(expl_env) qf = Mlp( input_size=n, output_size=action_space.n, hidden_sizes=[256, 256], init_w=variant["init_w"], b_init_value=variant["b_init_value"], ) target_qf = Mlp( input_size=n, output_size=action_space.n, hidden_sizes=[256, 256], init_w=variant["init_w"], b_init_value=variant["b_init_value"], ) qf_criterion = nn.MSELoss() if variant["softmax"]: eval_policy = SoftmaxDiscretePolicy(qf, variant["temperature"]) else: eval_policy = ArgmaxDiscretePolicy(qf) expl_policy = PolicyWrappedWithExplorationStrategy( LinearEpsilonGreedy(action_space, anneal_schedule=variant["anneal_schedule"]), eval_policy, ) eval_path_collector = MdpPathCollector(eval_env, eval_policy, render=variant["render"]) expl_path_collector = MdpPathCollector(expl_env, expl_policy, render=variant["render"]) trainer = DQNTrainer(qf=qf, target_qf=target_qf, qf_criterion=qf_criterion, **variant["trainer_kwargs"]) replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], expl_env) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant["algorithm_kwargs"]) algorithm.to(ptu.device) algorithm.train()
def experiment(variant):
    from simple_sup import SimpleSupEnv
    expl_env = SimpleSupEnv(**variant['env_kwars'])
    eval_env = SimpleSupEnv(**variant['env_kwars'])
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    hidden_dim = variant['hidden_dim']
    encoder = nn.Sequential(
        nn.Linear(obs_dim, hidden_dim), nn.ReLU(),
        nn.Linear(hidden_dim, hidden_dim), nn.ReLU(),
    )
    decoder = nn.Linear(hidden_dim, action_dim)
    from layers import ReshapeLayer
    sup_learner = nn.Sequential(decoder, ReshapeLayer(shape=(1, action_dim)))
    from sup_softmax_policy import SupSoftmaxPolicy
    policy = SupSoftmaxPolicy(encoder, decoder, sup_learner)
    print('parameters: ', np.sum([p.view(-1).shape[0] for p in policy.parameters()]))
    vf = Mlp(hidden_sizes=[], input_size=obs_dim, output_size=1)
    vf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)
    expl_policy = policy
    eval_path_collector = MdpPathCollector(eval_env, eval_policy)
    expl_path_collector = MdpPathCollector(expl_env, expl_policy)
    from sup_replay_buffer import SupReplayBuffer
    replay_buffer = SupReplayBuffer(observation_dim=obs_dim, label_dim=1, max_replay_buffer_size=int(1e6))
    from sup import SupTrainer
    trainer = SupTrainer(policy=policy, value_function=vf, vf_criterion=vf_criterion, replay_buffer=replay_buffer, **variant['trainer_kwargs'])
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    from traffic.make_env import make_env
    expl_env = make_env(args.exp_name, **variant['env_kwargs'])
    eval_env = make_env(args.exp_name, **variant['env_kwargs'])
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    label_num = expl_env.label_num
    label_dim = expl_env.label_dim
    from graph_builder_multi import MultiTrafficGraphBuilder
    policy_gb = MultiTrafficGraphBuilder(input_dim=4 + label_dim, node_num=expl_env.max_veh_num + 1,
                                         ego_init=torch.tensor([0., 1.]), other_init=torch.tensor([1., 0.]))
    if variant['gnn_kwargs']['attention']:
        from gnn_attention_net import GNNAttentionNet
        gnn_class = GNNAttentionNet
    else:
        from gnn_net import GNNNet
        gnn_class = GNNNet
    policy_gnn = gnn_class(pre_graph_builder=policy_gb, node_dim=variant['gnn_kwargs']['node'],
                           num_conv_layers=variant['gnn_kwargs']['layer'],
                           hidden_activation=variant['gnn_kwargs']['activation'])
    from layers import FlattenLayer, SelectLayer
    policy = nn.Sequential(policy_gnn, SelectLayer(1, 0), FlattenLayer(), nn.ReLU(),
                           nn.Linear(variant['gnn_kwargs']['node'], action_dim))
    sup_gb = MultiTrafficGraphBuilder(input_dim=4, node_num=expl_env.max_veh_num + 1,
                                      ego_init=torch.tensor([0., 1.]), other_init=torch.tensor([1., 0.]))
    sup_attentioner = None
    from layers import ReshapeLayer
    from gnn_net import GNNNet
    sup_gnn = GNNNet(pre_graph_builder=sup_gb, node_dim=variant['gnn_kwargs']['node'],
                     num_conv_layers=variant['gnn_kwargs']['layer'],
                     hidden_activation=variant['gnn_kwargs']['activation'])
    sup_learner = nn.Sequential(
        sup_gnn,
        SelectLayer(1, np.arange(1, expl_env.max_veh_num + 1)),
        nn.ReLU(),
        nn.Linear(variant['gnn_kwargs']['node'], label_dim),
    )
    from sup_sep_softmax_policy import SupSepSoftmaxPolicy
    policy = SupSepSoftmaxPolicy(policy, sup_learner, label_num, label_dim)
    print('parameters: ', np.sum([p.view(-1).shape[0] for p in policy.parameters()]))
    vf = Mlp(hidden_sizes=[32, 32], input_size=obs_dim, output_size=1)
    vf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)
    expl_policy = policy
    eval_path_collector = MdpPathCollector(eval_env, eval_policy)
    from sup_sep_rollout import sup_sep_rollout
    expl_path_collector = MdpPathCollector(expl_env, expl_policy, rollout_fn=sup_sep_rollout)
    from sup_replay_buffer import SupReplayBuffer
    replay_buffer = SupReplayBuffer(observation_dim=obs_dim, label_dim=label_num, max_replay_buffer_size=int(1e6))
    from rlkit.torch.vpg.trpo_sup_sep import TRPOSupSepTrainer
    trainer = TRPOSupSepTrainer(policy=policy, value_function=vf, vf_criterion=vf_criterion,
                                replay_buffer=replay_buffer, **variant['trainer_kwargs'])
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        log_path_function=get_traffic_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    # common.initialise(variant)
    setup_logger("name-of-experiment", variant=variant)
    ptu.set_gpu_mode(True)
    expl_env = gym.make(variant["env_name"], seed=5)
    eval_env = gym.make(variant["env_name"], seed=5)
    ANCILLARY_GOAL_SIZE = 16
    SYMBOLIC_ACTION_SIZE = 12  # Size of embedding (ufva/multihead) for goal space direction to controller
    GRID_SIZE = 31
    action_dim = ANCILLARY_GOAL_SIZE
    symbolic_action_space = gym.spaces.Discrete(ANCILLARY_GOAL_SIZE)
    symb_env = gym.make(variant["env_name"])
    symb_env.action_space = symbolic_action_space
    (obs_shape, obs_space, action_space, n, mlp, channels, fc_input) = common.get_spaces(expl_env)
    qf = Mlp(input_size=n, output_size=action_dim, hidden_sizes=[256, 256],
             init_w=variant["init_w"], b_init_value=variant["b_init_value"])
    target_qf = Mlp(input_size=n, output_size=action_dim, hidden_sizes=[256, 256],
                    init_w=variant["init_w"], b_init_value=variant["b_init_value"])
    planner = ENHSPPlanner()
    # collect
    filepath = "/home/achester/anaconda3/envs/goal-gen/.guild/runs/e77c75eed02e4b38a0a308789fbfcbd8/data/params.pkl"  # collect
    with (open(filepath, "rb")) as openfile:
        while True:
            try:
                policies = pickle.load(openfile)
            except EOFError:
                break
    loaded_collect_policy = policies["exploration/policy"]
    loaded_collect_policy.rnn_hxs = loaded_collect_policy.rnn_hxs[0].unsqueeze(0)
    eval_collect = CraftController(loaded_collect_policy, n=GRID_SIZE)
    expl_collect = CraftController(loaded_collect_policy, n=GRID_SIZE)
    # other
    # filepath = "/home/achester/anaconda3/envs/goal-gen/.guild/runs/cf5c31afe0724acd8f6398d77a80443e/data/params.pkl"  # other (RC 28)
    filepath = "/home/achester/anaconda3/envs/goal-gen/.guild/runs/4989f4bcbadb4ac58c3668c068d63225/data/params.pkl"  # other (RC 55)
    # filepath = "/home/achester/Documents/misc/craft-model/params.pkl"
    with (open(filepath, "rb")) as openfile:
        while True:
            try:
                policies = pickle.load(openfile)
            except EOFError:
                break
    loaded_other_policy = policies["exploration/policy"]
    loaded_other_policy.rnn_hxs = loaded_other_policy.rnn_hxs[0].unsqueeze(0)
    eval_other = CraftController(loaded_other_policy, n=GRID_SIZE)
    expl_other = CraftController(loaded_other_policy, n=GRID_SIZE)
    eval_controller = PretrainedController([eval_collect, eval_other])
    expl_controller = PretrainedController([expl_collect, expl_other])
    function_env = gym.make(variant["env_name"])
    qf_criterion = nn.MSELoss()
    if variant["softmax"]:
        eval_learner = SoftmaxDiscretePolicy(qf, variant["temperature"])
    else:
        eval_learner = ArgmaxDiscretePolicy(qf)
    expl_learner = PolicyWrappedWithExplorationStrategy(
        LinearEpsilonGreedy(symbolic_action_space, anneal_schedule=variant["anneal_schedule"]),
        eval_learner,
    )
    eval_policy = LearnPlanPolicy(eval_learner, planner, eval_controller, num_processes=1, vectorised=False, env=function_env)
    expl_policy = LearnPlanPolicy(expl_learner, planner, expl_controller, num_processes=1, vectorised=False, env=function_env)
    eval_path_collector = IntermediatePathCollector(
        eval_env, eval_policy, rollout=intermediate_rollout, gamma=1, render=variant["render"],
        single_plan_discounting=variant["trainer_kwargs"]["single_plan_discounting"],
        experience_interval=variant["experience_interval"],
    )
    expl_path_collector = IntermediatePathCollector(
        expl_env, expl_policy, rollout=intermediate_rollout, gamma=variant["trainer_kwargs"]["discount"],
        render=variant["render"],
        single_plan_discounting=variant["trainer_kwargs"]["single_plan_discounting"],
        experience_interval=variant["experience_interval"],
    )
    if variant["double_dqn"]:
        trainer = DoubleDQNTrainer(qf=qf, target_qf=target_qf, qf_criterion=qf_criterion, **variant["trainer_kwargs"])
    else:
        trainer = DQNTrainer(qf=qf, target_qf=target_qf, qf_criterion=qf_criterion, **variant["trainer_kwargs"])
    replay_buffer = PlanReplayBuffer(variant["replay_buffer_size"], symb_env)
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"])
    algorithm.to(ptu.device)
    algorithm.train()
with torch.no_grad():
    players = []
    for pid in range(len(P_paths)):
        d_path = pre_dir + '/' + P_paths[pid] + '/seed' + str(seed)
        if args.epoch:
            d_path += '/itr_{}.pkl'.format(args.epoch)
        else:
            d_path += '/params.pkl'
        data = torch.load(d_path, map_location='cpu')
        policy_n = data['trainer/policy_n']
        if not args.rand:
            print('make_deterministic')
            if isinstance(policy_n[0], TanhGaussianPolicy):
                policy_n = [MakeDeterministic(policy) for policy in policy_n]
            elif isinstance(policy_n[0], GumbelSoftmaxMlpPolicy):
                policy_n = [ArgmaxDiscretePolicy(policy, use_preactivation=True) for policy in policy_n]
        players.append(policy_n)
    for p1id in range(len(P_paths)):
        for p2id in range(len(P_paths)):
            pair_name = '{}-{}'.format(P_paths[p1id], P_paths[p2id])
            # print(pair_name)
            if (pair_name in results[seed].keys()) and (not args.new):
                print('pass')
                Cr1 = results[seed][pair_name]['r1']
                Cr2 = results[seed][pair_name]['r2']
                print('{}: r1: {:.2f}; r2: {:.2f}'.format(pair_name, np.mean(Cr1), np.mean(Cr2)))
            else:
                results[seed][pair_name] = {}
                player1 = players[p1id]
                player2 = players[p2id]
def experiment(variant):
    from traffic.make_env import make_env
    expl_env = make_env(args.exp_name, **variant['env_kwargs'])
    eval_env = make_env(args.exp_name, **variant['env_kwargs'])
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    encoders = []
    mlp = Mlp(input_size=obs_dim, output_size=32, hidden_sizes=[32])
    encoders.append(mlp)
    sup_learners = []
    for i in range(2):
        mlp = Mlp(input_size=obs_dim, output_size=2, hidden_sizes=[32])
        sup_learner = SoftmaxPolicy(mlp, learn_temperature=False)
        sup_learners.append(sup_learner)
        encoders.append(sup_learner)
    decoder = Mlp(input_size=int(32 + 2 * 2), output_size=action_dim, hidden_sizes=[])
    module = CombineNet(encoders=encoders, decoder=decoder, no_gradient=variant['no_gradient'])
    policy = SoftmaxPolicy(module, **variant['policy_kwargs'])
    vf = Mlp(hidden_sizes=[32, 32], input_size=obs_dim, output_size=1)
    vf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)
    expl_policy = policy
    eval_path_collector = MdpPathCollector(eval_env, eval_policy)
    expl_path_collector = MdpPathCollector(expl_env, expl_policy)
    from sup_replay_buffer import SupReplayBuffer
    replay_buffer = SupReplayBuffer(observation_dim=obs_dim, label_dims=[1, 1], max_replay_buffer_size=int(1e6))
    from rlkit.torch.vpg.ppo_sup import PPOSupTrainer
    trainer = PPOSupTrainer(policy=policy, value_function=vf, vf_criterion=vf_criterion,
                            sup_learners=sup_learners, replay_buffer=replay_buffer, **variant['trainer_kwargs'])
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        log_path_function=get_traffic_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(doodad_config, variant):
    from rlkit.core import logger
    from rlkit.launchers.launcher_util import setup_logger
    print("doodad_config.base_log_dir: ", doodad_config.base_log_dir)
    from datetime import datetime
    timestamp = datetime.now().strftime('%Y_%m_%d_%H_%M_%S_%f')
    setup_logger('wrapped_' + variant['env'], variant=variant,
                 log_dir=doodad_config.base_log_dir + "/smirl/" + variant['exp_name'] + "/" + timestamp + "/")
    if (variant["log_comet"]):
        try:
            comet_logger = Experiment(api_key=launchers.config.COMET_API_KEY,
                                      project_name=launchers.config.COMET_PROJECT_NAME,
                                      workspace=launchers.config.COMET_WORKSPACE)
            logger.set_comet_logger(comet_logger)
            comet_logger.set_name(str(variant['env']) + "_" + str(variant['exp_name']))
            print("variant: ", variant)
            variant['comet_key'] = comet_logger.get_key()
            comet_logger.log_parameters(variant)
            print(comet_logger)
        except Exception as inst:
            print("Not tracking training via commet.ml")
            print("Error: ", inst)
    import gym
    from torch import nn as nn
    import rlkit.torch.pytorch_util as ptu
    import torch
    from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
    from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
    from rlkit.policies.argmax import ArgmaxDiscretePolicy
    from rlkit.torch.dqn.dqn import DQNTrainer
    from rlkit.data_management.env_replay_buffer import EnvReplayBuffer
    from rlkit.samplers.data_collector import MdpPathCollector
    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    from surprise.utils.rendering_algorithm import TorchBatchRLRenderAlgorithm
    from surprise.envs.tetris.tetris import TetrisEnv
    from surprise.wrappers.obsresize import ResizeObservationWrapper, RenderingObservationWrapper, SoftResetWrapper
    import pdb
    base_env = get_env(variant)
    base_env2 = get_env(variant)
    print("GPU_BUS_Index", variant["GPU_BUS_Index"])
    if torch.cuda.is_available() and doodad_config.use_gpu:
        print("Using the GPU for learning")
        # ptu.set_gpu_mode(True, gpu_id=doodad_config.gpu_id)
        ptu.set_gpu_mode(True, gpu_id=variant["GPU_BUS_Index"])
    else:
        print("NOT Using the GPU for learning")
    # base_env2 = RenderingObservationWrapper(base_env2)
    expl_env, network = add_wrappers(base_env, variant, device=ptu.device)
    eval_env, _ = add_wrappers(base_env2, variant, device=ptu.device, eval=True, network=network)
    if ("vae_wrapper" in variant["wrappers"]):
        eval_env._network = base_env._network
    obs_dim = expl_env.observation_space.low.shape
    print("Final obs dim", obs_dim)
    action_dim = eval_env.action_space.n
    print("Action dimension: ", action_dim)
    qf, target_qf = get_network(variant["network_args"], obs_dim, action_dim)
    qf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(qf)
    if "prob_random_action" in variant:
        expl_policy = PolicyWrappedWithExplorationStrategy(
            EpsilonGreedy(expl_env.action_space, prob_random_action=variant["prob_random_action"],
                          prob_end=variant["prob_end"], steps=variant["steps"]),
            eval_policy,
        )
    else:
        expl_policy = PolicyWrappedWithExplorationStrategy(
            EpsilonGreedy(expl_env.action_space, prob_random_action=0.8, prob_end=0.05),
            eval_policy,
        )
    eval_path_collector = MdpPathCollector(eval_env, eval_policy, render_kwargs=variant['render_kwargs'])
    expl_path_collector = MdpPathCollector(expl_env, expl_policy)
    trainer = DQNTrainer(qf=qf, target_qf=target_qf, qf_criterion=qf_criterion, **variant['trainer_kwargs'])
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    algorithm = TorchBatchRLRenderAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    fov, delta, num_ch = 13, 3, 3
    expl_env = EnvBrainbow('0:data/brainbow/training_sample.tif', coord_interval=2, img_mean=128,
                           img_stddev=33, num_ch=3, fov=fov, delta=delta, seed=0)
    eval_env = EnvBrainbow('0:data/brainbow/training_sample.tif', coord_interval=2, img_mean=128,
                           img_stddev=33, num_ch=3, fov=fov, delta=delta, seed=0)
    obs_dim = expl_env.observation_space.low.shape  # 13, 13, 3
    action_dim = eval_env.action_space.n  # 2
    kernel_sizes = [3, 3, 3]
    n_channels = [32, 64, 64]
    strides = [1, 1, 1]
    paddings = [0, 0, 0]
    hidden_sizes = [512]
    qf = CNN(input_width=fov, input_height=fov, input_channels=num_ch, output_size=action_dim,
             kernel_sizes=kernel_sizes, n_channels=n_channels, strides=strides, paddings=paddings,
             hidden_sizes=hidden_sizes, batch_norm_conv=True, batch_norm_fc=False)
    target_qf = CNN(input_width=fov, input_height=fov, input_channels=num_ch, output_size=action_dim,
                    kernel_sizes=kernel_sizes, n_channels=n_channels, strides=strides, paddings=paddings,
                    hidden_sizes=hidden_sizes, batch_norm_conv=True, batch_norm_fc=False)
    print(qf)
    qf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(qf)
    expl_policy = PolicyWrappedWithExplorationStrategy(EpsilonGreedy(expl_env.action_space), eval_policy)
    eval_path_collector = MdpPathCollector(eval_env, eval_policy)
    expl_path_collector = MdpPathCollector(expl_env, expl_policy)
    trainer = DQNTrainer(qf=qf, target_qf=target_qf, qf_criterion=qf_criterion, **variant['trainer_kwargs'])
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    import sys
    from traffic.make_env import make_env
    expl_env = make_env(args.exp_name, **variant['env_kwargs'])
    eval_env = make_env(args.exp_name, **variant['env_kwargs'])
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    from graph_builder_multi import MultiTrafficGraphBuilder
    gb = MultiTrafficGraphBuilder(input_dim=4, node_num=expl_env.max_veh_num + 1,
                                  ego_init=torch.tensor([0., 1.]), other_init=torch.tensor([1., 0.]))
    from gnn_net import GNNNet
    gnn = GNNNet(pre_graph_builder=gb, node_dim=16, num_conv_layers=3)
    from layers import SelectLayer
    encoders = []
    encoders.append(nn.Sequential(gnn, SelectLayer(1, 0), nn.ReLU()))
    sup_learners = []
    for i in range(expl_env.max_veh_num):
        sup_learner = nn.Sequential(gnn, SelectLayer(1, i + 1), nn.ReLU(), nn.Linear(16, 2))
        sup_learner = SoftmaxPolicy(sup_learner, learn_temperature=False)
        sup_learners.append(sup_learner)
        encoders.append(sup_learner)
    decoder = Mlp(input_size=int(16 + 2 * expl_env.max_veh_num), output_size=action_dim, hidden_sizes=[])
    from layers import ConcatLayer
    need_gradients = np.array([True] * len(encoders))
    if variant['no_gradient']:
        need_gradients[1:] = False
    policy = nn.Sequential(ConcatLayer(encoders, need_gradients=list(need_gradients), dim=1), decoder)
    policy = SoftmaxPolicy(policy, learn_temperature=False)
    vf = Mlp(hidden_sizes=[32, 32], input_size=obs_dim, output_size=1)
    vf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)
    expl_policy = policy
    eval_path_collector = MdpPathCollector(eval_env, eval_policy)
    expl_path_collector = MdpPathCollector(expl_env, expl_policy)
    from sup_replay_buffer import SupReplayBuffer
    replay_buffer = SupReplayBuffer(observation_dim=obs_dim, label_dims=[1] * expl_env.max_veh_num,
                                    max_replay_buffer_size=int(1e6))
    from rlkit.torch.vpg.ppo_sup import PPOSupTrainer
    trainer = PPOSupTrainer(policy=policy, value_function=vf, vf_criterion=vf_criterion,
                            sup_learners=sup_learners, replay_buffer=replay_buffer, **variant['trainer_kwargs'])
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        log_path_function=get_traffic_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    num_agent = variant['num_agent']
    from rlkit.envs.zmq_env import ZMQEnv
    expl_env = ZMQEnv(variant['port'])
    eval_env = expl_env
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    policy_n, target_policy_n, qf1_n, target_qf1_n, qf2_n, target_qf2_n, eval_policy_n, expl_policy_n = \
        [], [], [], [], [], [], [], []
    for i in range(num_agent):
        policy = SoftmaxMlpPolicy(input_size=obs_dim, output_size=action_dim, **variant['policy_kwargs'])
        target_policy = copy.deepcopy(policy)
        qf1 = FlattenMlp(input_size=(obs_dim * num_agent + action_dim * (num_agent - 1)), output_size=action_dim, **variant['qf_kwargs'])
        target_qf1 = copy.deepcopy(qf1)
        qf2 = FlattenMlp(input_size=(obs_dim * num_agent + action_dim * (num_agent - 1)), output_size=action_dim, **variant['qf_kwargs'])
        target_qf2 = copy.deepcopy(qf2)
        eval_policy = ArgmaxDiscretePolicy(policy)
        expl_policy = PolicyWrappedWithExplorationStrategy(EpsilonGreedy(expl_env.action_space), eval_policy)
        policy_n.append(policy)
        target_policy_n.append(target_policy)
        qf1_n.append(qf1)
        target_qf1_n.append(target_qf1)
        qf2_n.append(qf2)
        target_qf2_n.append(target_qf2)
        eval_policy_n.append(eval_policy)
        expl_policy_n.append(expl_policy)
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent)
    trainer = PRGDiscreteTrainer(env=expl_env, qf1_n=qf1_n, target_qf1_n=target_qf1_n, qf2_n=qf2_n,
                                 target_qf2_n=target_qf2_n, policy_n=policy_n, target_policy_n=target_policy_n,
                                 **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    from traffic.make_env import make_env
    expl_env = make_env(args.exp_name, **variant['env_kwargs'])
    eval_env = make_env(args.exp_name, **variant['env_kwargs'])
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    label_num = expl_env.label_num
    label_dim = expl_env.label_dim
    encoder = nn.Sequential(
        nn.Linear(obs_dim, 32), nn.ReLU(),
        nn.Linear(32, 32), nn.ReLU(),
    )
    decoder = nn.Linear(32, action_dim)
    from layers import ReshapeLayer
    sup_learner = nn.Sequential(
        nn.Linear(32, int(label_num * label_dim)),
        ReshapeLayer(shape=(label_num, label_dim)),
    )
    from sup_softmax_policy import SupSoftmaxPolicy
    policy = SupSoftmaxPolicy(encoder, decoder, sup_learner)
    print('parameters: ', np.sum([p.view(-1).shape[0] for p in policy.parameters()]))
    vf = Mlp(hidden_sizes=[32, 32], input_size=obs_dim, output_size=1)
    vf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)
    expl_policy = policy
    eval_path_collector = MdpPathCollector(eval_env, eval_policy)
    expl_path_collector = MdpPathCollector(expl_env, expl_policy)
    from sup_replay_buffer import SupReplayBuffer
    replay_buffer = SupReplayBuffer(observation_dim=obs_dim, label_dim=label_num, max_replay_buffer_size=int(1e6))
    from rlkit.torch.vpg.trpo_sup import TRPOSupTrainer
    trainer = TRPOSupTrainer(policy=policy, value_function=vf, vf_criterion=vf_criterion,
                             replay_buffer=replay_buffer, **variant['trainer_kwargs'])
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        log_path_function=get_traffic_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()