def experiment(variant, data):
    """Resume CQL training on the panda env from a loaded snapshot.

    Rebuilds the environment fresh and pulls the pretrained networks out of
    ``data`` (an rlkit snapshot dict keyed by 'trainer/*' / 'evaluation/*'),
    then restores or regenerates the replay buffer and continues training.

    :param variant: config dict (headless, seed, buffer paths, h5path,
        trainer/algorithm kwargs, start_epoch)
    :param data: snapshot dict holding the pretrained networks and policies
    """
    # make new env, reloading with data['evaluation/env'] seems to make bug
    eval_env = gym.make("panda-v0", **{"headless": variant["headless"]})
    eval_env.seed(variant['seed'])
    # NOTE(review): exploration reuses the SAME env instance as evaluation —
    # confirm this sharing is intended (state can leak between collectors).
    expl_env = eval_env
    qf1 = data['trainer/qf1']
    qf2 = data['trainer/qf2']
    target_qf1 = data['trainer/target_qf1']
    target_qf2 = data['trainer/target_qf2']
    policy = data['trainer/policy']
    eval_policy = data["evaluation/policy"]
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    # Exploration collector is also built on eval_env (see NOTE above).
    expl_path_collector = CustomMDPPathCollector(eval_env, )
    buffer_filename = None
    if variant['buffer_filename'] is not None:
        buffer_filename = variant['buffer_filename']
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    # Either restore a previously saved buffer, or fill the buffer from an
    # offline dataset converted to (s, a, r, s', done) tuples via d4rl.
    if variant['load_buffer'] and buffer_filename is not None:
        replay_buffer.load_buffer(buffer_filename)
    else:
        dataset = get_dataset(variant["h5path"], eval_env)
        load_hdf5(d4rl.qlearning_dataset(eval_env, dataset), replay_buffer)
    trainer = CQLTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        eval_both=True,
        # batch_rl mirrors load_buffer: offline data -> no env interaction.
        batch_rl=variant['load_buffer'],
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    # Resume epoch numbering where the snapshot left off.
    algorithm.train(start_epoch=variant["start_epoch"])
def experiment(variant):
    """Train a DQN agent on CartPole-v0.

    Builds separate exploration/evaluation envs, twin (online + target)
    Q-networks, epsilon-greedy exploration, and runs batch RL training.

    :param variant: config dict with trainer_kwargs, algorithm_kwargs and
        replay_buffer_size
    """
    expl_env = gym.make("CartPole-v0")
    eval_env = gym.make("CartPole-v0")
    n_obs = expl_env.observation_space.low.size
    n_actions = eval_env.action_space.n

    # Online and target Q-networks share one architecture.
    q_net = Mlp(hidden_sizes=[32, 32], input_size=n_obs, output_size=n_actions)
    target_q_net = Mlp(hidden_sizes=[32, 32], input_size=n_obs, output_size=n_actions)

    # Greedy policy for evaluation; epsilon-greedy wrapper for exploration.
    greedy_policy = ArgmaxDiscretePolicy(q_net)
    explore_policy = PolicyWrappedWithExplorationStrategy(
        EpsilonGreedy(expl_env.action_space),
        greedy_policy,
    )
    eval_collector = MdpPathCollector(eval_env, greedy_policy)
    expl_collector = MdpPathCollector(expl_env, explore_policy)

    trainer = DQNTrainer(
        qf=q_net,
        target_qf=target_q_net,
        qf_criterion=nn.MSELoss(),
        **variant["trainer_kwargs"],
    )
    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], expl_env)
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_collector,
        evaluation_data_collector=eval_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"],
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Double-DQN on the ToolsEnv gridworld, with optional "lifetime" mode.

    Lifetime mode is active when the env has no time horizon; it reuses the
    exploration policy for evaluation and a lifetime-aware path collector.

    :param variant: config dict with 'env_kwargs' and nested 'algo_kwargs'
        (layer_size, replay_buffer_size, trainer_kwargs, algorithm_kwargs)
    """
    from rlkit.envs.gym_minigrid.gym_minigrid import envs
    expl_env = ToolsEnv(**variant['env_kwargs'])
    eval_env = ToolsEnv(**variant['env_kwargs'])
    rollout_env = ToolsEnv(**variant['env_kwargs'])
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    layer_size = variant['algo_kwargs']['layer_size']
    # "Lifetime" = no episode time limit configured.
    lifetime = variant['env_kwargs'].get('time_horizon', 0) == 0
    if lifetime:
        assert eval_env.time_horizon == 0, 'cannot have time horizon for lifetime env'
    qf = gen_network(variant['algo_kwargs'], action_dim, layer_size)
    target_qf = gen_network(variant['algo_kwargs'], action_dim, layer_size)
    qf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(qf)
    # eval_policy = SoftmaxQPolicy(qf)
    # Epsilon-greedy with decaying epsilon (rate 1e-5, from 1 down to 0.1).
    expl_policy = PolicyWrappedWithExplorationStrategy(
        EpsilonGreedyDecay(expl_env.action_space, 1e-5, 1, 0.1),
        eval_policy,
    )
    # In lifetime mode, evaluation uses the exploring policy as well.
    if lifetime:
        eval_policy = expl_policy
    # expl_policy = PolicyWrappedWithExplorationStrategy(
    #     EpsilonGreedy(expl_env.action_space, 0.5),
    #     eval_policy,
    # )
    # NOTE(review): when eval_env.time_horizon == 0, `lifetime` is True by
    # construction (same env_kwargs), so the MdpPathCollector arm of this
    # ternary looks unreachable — confirm whether that is intended.
    if eval_env.time_horizon == 0:
        collector_class = LifetimeMdpPathCollector if lifetime else MdpPathCollector
    else:
        collector_class = MdpPathCollectorConfig
    eval_path_collector = collector_class(
        eval_env,
        eval_policy,
        # render=True
    )
    expl_path_collector = collector_class(expl_env, expl_policy)
    trainer = DoubleDQNTrainer(qf=qf,
                               target_qf=target_qf,
                               qf_criterion=qf_criterion,
                               **variant['algo_kwargs']['trainer_kwargs'])
    replay_buffer = EnvReplayBuffer(
        variant['algo_kwargs']['replay_buffer_size'], expl_env)
    #algo_class = TorchLifetimeRLAlgorithm if lifetime else TorchBatchRLAlgorithm
    algo_class = TorchHumanInputLifetimeRLAlgorithm
    algorithm = algo_class(trainer=trainer,
                           exploration_env=expl_env,
                           evaluation_env=eval_env,
                           rollout_env=rollout_env,
                           exploration_data_collector=expl_path_collector,
                           evaluation_data_collector=eval_path_collector,
                           replay_buffer=replay_buffer,
                           **variant['algo_kwargs']['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Discrete SAC on the traffic environment.

    :param variant: config dict with policy_kwargs, qf_kwargs,
        trainer_kwargs, algorithm_kwargs and replay_buffer_size
    """
    import sys
    from traffic.make_env import make_env
    # NOTE(review): `args` is not defined in this function or its parameters —
    # it presumably comes from the enclosing script's global scope; confirm,
    # or thread the experiment name through `variant` instead.
    expl_env = make_env(args.exp_name)
    eval_env = make_env(args.exp_name)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    # MLP trunk feeding the softmax policy head.
    module = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=action_dim,
    )
    policy = SoftmaxPolicy(module, **variant['policy_kwargs'])
    # Twin Q-networks; targets start as deep copies of the online nets.
    qf1 = Mlp(input_size=obs_dim, output_size=action_dim, **variant['qf_kwargs'])
    target_qf1 = copy.deepcopy(qf1)
    qf2 = Mlp(input_size=obs_dim, output_size=action_dim, **variant['qf_kwargs'])
    target_qf2 = copy.deepcopy(qf2)
    # Evaluation is greedy over the policy's pre-activations; exploration
    # samples from the stochastic softmax policy itself.
    eval_policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)
    expl_policy = policy
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    qf_criterion = nn.MSELoss()
    trainer = SACDiscreteTrainer(env=eval_env,
                                 policy=policy,
                                 qf1=qf1,
                                 qf2=qf2,
                                 target_qf1=target_qf1,
                                 target_qf2=target_qf2,
                                 qf_criterion=qf_criterion,
                                 **variant['trainer_kwargs'])
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Run DDPG on HalfCheetah.

    Pipeline: build eval/expl envs; create the Q-function and policy plus
    deep-copied target networks; wire up path collectors, an OU-noise
    exploration policy and a replay buffer; hand everything to the batch RL
    algorithm and train.

    :param variant: config dict with qf_kwargs, policy_kwargs,
        trainer_kwargs, algorithm_kwargs and replay_buffer_size
    """
    eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs'],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        **variant['policy_kwargs'],
    )
    # Target networks begin as exact copies of the online networks.
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)

    # Evaluation rolls out the deterministic policy directly; exploration
    # adds Ornstein-Uhlenbeck noise on top of it.
    eval_path_collector = MdpPathCollector(eval_env, policy)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=OUStrategy(action_space=expl_env.action_space),
        policy=policy,
    )
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)

    trainer = DDPGTrainer(
        qf=qf,
        target_qf=target_qf,
        policy=policy,
        target_policy=target_policy,
        **variant['trainer_kwargs'],
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'],
    )
    # Move every network onto the configured device, then train.
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Twin-SAC on a flattened, normalized Point2D goal environment.

    :param variant: config dict with env_kwargs, qf_kwargs, policy_kwargs,
        trainer_kwargs, algo_kwargs and replay_buffer_size
    """
    env = Point2DEnv(**variant['env_kwargs'])
    env = FlatGoalEnv(env)
    env = NormalizedBoxEnv(env)
    action_dim = int(np.prod(env.action_space.shape))
    obs_dim = int(np.prod(env.observation_space.shape))
    # Twin Q-networks plus their target copies, all one architecture.
    qf1 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1,
                    **variant['qf_kwargs'])
    qf2 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1,
                    **variant['qf_kwargs'])
    target_qf1 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1,
                           **variant['qf_kwargs'])
    target_qf2 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1,
                           **variant['qf_kwargs'])
    policy = TanhGaussianPolicy(obs_dim=obs_dim, action_dim=action_dim,
                                **variant['policy_kwargs'])
    # A single env instance serves both exploration and evaluation here.
    eval_env = expl_env = env
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = TwinSACTrainer(env=eval_env,
                             policy=policy,
                             qf1=qf1,
                             qf2=qf2,
                             target_qf1=target_qf1,
                             target_qf2=target_qf2,
                             **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        # FIX: was `data_buffer=replay_buffer` — every sibling launcher and
        # the standard TorchBatchRLAlgorithm signature use `replay_buffer=`;
        # the old kwarg would raise a TypeError at construction.
        replay_buffer=replay_buffer,
        **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """DQN with CNN Q-networks over a symbolic action space, using a
    learn-plan policy wrapper and a hierarchical rollout function.

    NOTE(review): several names used here are not defined locally or as
    parameters (`obs_dim`, `channels`, `action_dim`, `symbolic_action_space`,
    `eval_env`, `expl_env`, `symb_env`, `hierarchical_rollout`) — presumably
    module-level globals in the enclosing script; verify before reuse.

    :param variant: config dict with anneal_rate, trainer_kwargs,
        algorithm_kwargs and replay_buffer_size
    """
    # Online CNN Q-network: square obs_dim x obs_dim image input.
    qf = CNN(
        input_width=obs_dim,
        input_height=obs_dim,
        input_channels=channels,
        output_size=action_dim,
        kernel_sizes=[8, 4],
        n_channels=[16, 32],
        strides=[4, 2],
        paddings=[0, 0],
        hidden_sizes=[256],
    )
    # Target network mirrors the online architecture exactly.
    target_qf = CNN(
        input_width=obs_dim,
        input_height=obs_dim,
        input_channels=channels,
        output_size=action_dim,
        kernel_sizes=[8, 4],
        n_channels=[16, 32],
        strides=[4, 2],
        paddings=[0, 0],
        hidden_sizes=[256],
    )
    qf_criterion = nn.MSELoss()
    eval_learner_policy = ArgmaxDiscretePolicy(qf)
    # Epsilon anneals over the symbolic action space during exploration.
    expl_learner_policy = PolicyWrappedWithExplorationStrategy(
        AnnealedEpsilonGreedy(symbolic_action_space,
                              anneal_rate=variant["anneal_rate"]),
        eval_learner_policy,
    )
    # LearnPlanPolicy wraps the low-level learner policies.
    eval_policy = LearnPlanPolicy(eval_learner_policy)
    expl_policy = LearnPlanPolicy(expl_learner_policy)
    # Collectors use a hierarchical rollout instead of the default rollout.
    eval_path_collector = MdpPathCollector(eval_env, eval_policy,
                                           rollout=hierarchical_rollout)
    expl_path_collector = MdpPathCollector(expl_env, expl_policy,
                                           rollout=hierarchical_rollout)
    trainer = DQNTrainer(qf=qf,
                         target_qf=target_qf,
                         qf_criterion=qf_criterion,
                         **variant["trainer_kwargs"])
    # Buffer is sized for the symbolic env, not the raw env.
    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], symb_env)
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Train a DQN agent on the state-based crafting environment.

    The envs score success via ``eval_eatbread``; substitute a different
    success_function to target another task.

    :param variant: config dict with trainer_kwargs, algorithm_kwargs and
        replay_buffer_size
    """
    # Select a different success_function for different tasks.
    expl_env = GymCraftingEnv(state_obs=True, few_obj=True,
                              success_function=eval_eatbread)
    eval_env = GymCraftingEnv(state_obs=True, few_obj=True,
                              success_function=eval_eatbread)
    n_obs = expl_env.observation_space.low.size
    n_actions = eval_env.action_space.n

    # Identical architectures for online and target Q-networks.
    q_net = Mlp(hidden_sizes=[32, 32], input_size=n_obs, output_size=n_actions)
    target_q_net = Mlp(hidden_sizes=[32, 32], input_size=n_obs, output_size=n_actions)

    greedy_policy = ArgmaxDiscretePolicy(q_net)
    explore_policy = PolicyWrappedWithExplorationStrategy(
        EpsilonGreedy(expl_env.action_space),
        greedy_policy,
    )
    eval_collector = MdpPathCollector(eval_env, greedy_policy)
    expl_collector = MdpPathCollector(expl_env, explore_policy)

    trainer = DQNTrainer(
        qf=q_net,
        target_qf=target_q_net,
        qf_criterion=nn.MSELoss(),
        **variant['trainer_kwargs'],
    )
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_collector,
        evaluation_data_collector=eval_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'],
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """SAC training with a custom per-epoch evaluation hook that appends
    results to a CSV file.

    :param variant: config dict with layer_size, replay_buffer_size,
        algorithm_kwargs and log_dir
    """
    expl_env = get_env()
    eval_env = get_env()
    # NOTE(review): this local list is never used — the hook list is
    # assigned directly onto `algorithm.post_epoch_funcs` below.
    post_epoch_funcs = []
    M = variant['layer_size']
    # get_sac_model builds the trainer (networks included) in one call.
    trainer = get_sac_model(env=eval_env, hidden_sizes=[M, M])
    policy = trainer.policy
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    # Accumulated evaluation stats, persisted to CSV after each epoch.
    columns = ['Epoch', 'mean', 'std']
    eval_result = pd.DataFrame(columns=columns)
    eval_output_csv = os.path.join(variant['log_dir'], 'eval_result.csv')

    def post_epoch_func(self, epoch):
        """Evaluate the current policy on a fresh env and log to CSV.

        Called by the algorithm after every epoch with the algorithm
        instance as `self`; mutates the enclosing `eval_result` frame.
        """
        nonlocal eval_result
        nonlocal policy
        print(f'-------------post_epoch_func start-------------')
        eval_result = my_eval_policy(
            env=get_env(),
            algorithm=self,
            epoch=epoch,
            eval_result=eval_result,
            output_csv=eval_output_csv,
        )
        print(f'-------------post_epoch_func done-------------')

    algorithm.post_epoch_funcs = [
        post_epoch_func,
    ]
    algorithm.to(ptu.device)
    algorithm.train()
def run_sac(base_expl_env, base_eval_env, variant):
    """SAC on goal-conditioned envs flattened with the goal appended to obs.

    :param base_expl_env: raw exploration env to wrap with FlatGoalEnv
    :param base_eval_env: raw evaluation env to wrap with FlatGoalEnv
    :param variant: config dict with layer_size, num_hidden_layers,
        trainer_kwargs, algorithm_kwargs and replay_buffer_size
    """
    expl_env = FlatGoalEnv(base_expl_env, append_goal_to_obs=True)
    eval_env = FlatGoalEnv(base_eval_env, append_goal_to_obs=True)
    n_obs = expl_env.observation_space.low.size
    n_act = eval_env.action_space.low.size
    width = variant["layer_size"]
    depth = variant["num_hidden_layers"]
    hidden = [width] * depth

    # Twin Q-networks and their targets share one architecture.
    qf1 = FlattenMlp(input_size=n_obs + n_act, output_size=1, hidden_sizes=hidden)
    qf2 = FlattenMlp(input_size=n_obs + n_act, output_size=1, hidden_sizes=hidden)
    target_qf1 = FlattenMlp(input_size=n_obs + n_act, output_size=1, hidden_sizes=hidden)
    target_qf2 = FlattenMlp(input_size=n_obs + n_act, output_size=1, hidden_sizes=hidden)
    policy = TanhGaussianPolicy(obs_dim=n_obs, action_dim=n_act, hidden_sizes=hidden)

    # Deterministic policy for evaluation; stochastic one for exploration.
    eval_collector = MdpPathCollector(eval_env, MakeDeterministic(policy))
    expl_collector = MdpPathCollector(expl_env, policy)
    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], expl_env)

    trainer = SACTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant["trainer_kwargs"],
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_collector,
        evaluation_data_collector=eval_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"],
    )
    # NOTE(review): unlike the sibling launchers, there is no
    # algorithm.to(ptu.device) here — confirm CPU training is intended.
    algorithm.train()
def experiment(variant):
    """Double-DQN on the traffic environment with a graph-network Q-function.

    :param variant: config dict with qf_kwargs, epsilon, trainer_kwargs,
        algorithm_kwargs and replay_buffer_size
    """
    import sys
    from traffic.make_env import make_env
    # NOTE(review): `args` is not defined in this function or its parameters —
    # presumably a global in the enclosing script; confirm, or pass the
    # experiment name through `variant`.
    expl_env = make_env(args.exp_name)
    eval_env = make_env(args.exp_name)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    # Graph builder: ego node vs. other nodes, with a fixed star-shaped
    # edge topology (node 0 connected to nodes 1 and 2, both directions).
    gb = TrafficGraphBuilder(input_dim=4,
                             ego_init=torch.tensor([0., 1.]),
                             other_init=torch.tensor([1., 0.]),
                             edge_index=torch.tensor([[0, 0, 1, 2],
                                                      [1, 2, 0, 0]]))
    qf = GNNNet(pre_graph_builder=gb,
                node_dim=16,
                output_dim=action_dim,
                post_mlp_kwargs=variant['qf_kwargs'],
                num_conv_layers=3)
    # Target network starts as a deep copy of the online GNN.
    target_qf = copy.deepcopy(qf)
    eval_policy = ArgmaxDiscretePolicy(qf)
    expl_policy = PolicyWrappedWithExplorationStrategy(
        EpsilonGreedy(expl_env.action_space, variant['epsilon']),
        eval_policy,
    )
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    qf_criterion = nn.MSELoss()
    trainer = DoubleDQNTrainer(qf=qf,
                               target_qf=target_qf,
                               qf_criterion=qf_criterion,
                               **variant['trainer_kwargs'])
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """DQN on the custom `environment` task configured via getArgs().

    :param variant: config dict with trainer_kwargs, algorithm_kwargs and
        replay_buffer_size
    """
    args = getArgs()
    expl_env = environment(args, 'dqn')
    eval_env = environment(args, 'dqn')
    n_obs = expl_env.get_obsdim()
    n_actions = expl_env.action_space.n

    # Online and target Q-networks share one MLP architecture.
    q_net = Mlp(hidden_sizes=[32, 32], input_size=n_obs, output_size=n_actions)
    target_q_net = Mlp(hidden_sizes=[32, 32], input_size=n_obs, output_size=n_actions)

    greedy_policy = ArgmaxDiscretePolicy(q_net)
    explore_policy = PolicyWrappedWithExplorationStrategy(
        EpsilonGreedy(expl_env.action_space),
        greedy_policy,
    )
    eval_collector = MdpPathCollector(eval_env, greedy_policy)
    expl_collector = MdpPathCollector(expl_env, explore_policy)

    trainer = DQNTrainer(
        qf=q_net,
        target_qf=target_q_net,
        qf_criterion=nn.MSELoss(),
        **variant['trainer_kwargs'],
    )
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_collector,
        evaluation_data_collector=eval_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'],
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Double-DQN on the traffic environment with an MLP Q-function and
    traffic-specific path logging.

    :param variant: config dict with qf_kwargs, epsilon, trainer_kwargs,
        algorithm_kwargs and replay_buffer_size
    """
    import sys
    from traffic.make_env import make_env
    # NOTE(review): `args` is not defined in this function or its parameters —
    # presumably a global in the enclosing script; confirm, or pass the
    # experiment name through `variant`.
    expl_env = make_env(args.exp_name)
    eval_env = make_env(args.exp_name)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    qf = Mlp(
        input_size=obs_dim,
        output_size=action_dim,
        **variant['qf_kwargs']
    )
    # Target network starts as a deep copy of the online network.
    target_qf = copy.deepcopy(qf)
    eval_policy = ArgmaxDiscretePolicy(qf)
    expl_policy = PolicyWrappedWithExplorationStrategy(
        EpsilonGreedy(expl_env.action_space, variant['epsilon']),
        eval_policy,
    )
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    qf_criterion = nn.MSELoss()
    trainer = DoubleDQNTrainer(
        qf=qf,
        target_qf=target_qf,
        qf_criterion=qf_criterion,
        **variant['trainer_kwargs']
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        # Custom logger that records traffic-specific path statistics.
        log_path_function = get_traffic_path_information,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """TD3 on HalfCheetah.

    :param variant: config dict with qf_kwargs, policy_kwargs,
        trainer_kwargs, algorithm_kwargs and replay_buffer_size
    """
    expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    n_obs = expl_env.observation_space.low.size
    n_act = expl_env.action_space.low.size

    # Twin critics plus targets; separate target policy as TD3 requires.
    qf1 = FlattenMlp(input_size=n_obs + n_act, output_size=1,
                     **variant["qf_kwargs"])
    qf2 = FlattenMlp(input_size=n_obs + n_act, output_size=1,
                     **variant["qf_kwargs"])
    target_qf1 = FlattenMlp(input_size=n_obs + n_act, output_size=1,
                            **variant["qf_kwargs"])
    target_qf2 = FlattenMlp(input_size=n_obs + n_act, output_size=1,
                            **variant["qf_kwargs"])
    policy = TanhMlpPolicy(input_size=n_obs, output_size=n_act,
                           **variant["policy_kwargs"])
    target_policy = TanhMlpPolicy(input_size=n_obs, output_size=n_act,
                                  **variant["policy_kwargs"])

    # Constant-sigma Gaussian exploration noise.
    noise = GaussianStrategy(
        action_space=expl_env.action_space,
        max_sigma=0.1,
        min_sigma=0.1,  # Constant sigma
    )
    noisy_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=noise,
        policy=policy,
    )
    eval_collector = MdpPathCollector(eval_env, policy)
    expl_collector = MdpPathCollector(expl_env, noisy_policy)
    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], expl_env)

    trainer = TD3Trainer(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        target_policy=target_policy,
        **variant["trainer_kwargs"],
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_collector,
        evaluation_data_collector=eval_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"],
    )
    algorithm.to(ptu.device)
    algorithm.train()
def get_replay_buffer(variant, expl_env):
    """Build the replay buffer appropriate for the experiment mode.

    :param variant: config dict; ``variant["mode"]`` selects the buffer type
        ("vanilla"/"icm" -> flat EnvReplayBuffer, "her"/"her+icm" ->
        goal-relabeling ObsDictRelabelingBuffer)
    :param expl_env: exploration environment the buffer stores transitions for
    :return: the constructed replay buffer
    :raises ValueError: if the mode is not one of the recognized values.
        (Previously an unknown mode fell through both branches and raised a
        confusing UnboundLocalError at the return statement.)
    """
    mode = variant["mode"]
    if mode in ["vanilla", "icm"]:
        replay_buffer = EnvReplayBuffer(
            env=expl_env,
            **variant["replay_buffer_kwargs"],
        )
    elif mode in ["her", "her+icm"]:
        replay_buffer = ObsDictRelabelingBuffer(
            env=expl_env,
            **variant["her"],
            **variant["replay_buffer_kwargs"]
        )
    else:
        raise ValueError(
            "Unknown mode %r; expected one of "
            "'vanilla', 'icm', 'her', 'her+icm'" % (mode,)
        )
    return replay_buffer
def experiment(variant):
    """DDPG on CartPole (mode=2) wrapped as a probability-over-actions
    continuous env via ProbDiscreteEnv.

    :param variant: config dict with qf_kwargs, policy_kwargs,
        trainer_kwargs, algorithm_kwargs and replay_buffer_size
    """
    from cartpole import CartPoleEnv
    from rlkit.envs.wrappers import ProbDiscreteEnv
    expl_env = ProbDiscreteEnv(CartPoleEnv(mode=2))
    eval_env = ProbDiscreteEnv(CartPoleEnv(mode=2))
    n_obs = eval_env.observation_space.low.size
    n_act = eval_env.action_space.low.size

    qf = FlattenMlp(
        input_size=n_obs + n_act,
        output_size=1,
        **variant['qf_kwargs'],
    )
    policy = SoftmaxMlpPolicy(
        input_size=n_obs,
        output_size=n_act,
        **variant['policy_kwargs'],
    )
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)

    eval_collector = MdpPathCollector(eval_env, policy)
    # No OU-noise wrapper here: actions must remain valid probability
    # vectors, so exploration uses the softmax policy directly.
    exploration_policy = policy
    expl_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)

    trainer = DDPGTrainer(
        qf=qf,
        target_qf=target_qf,
        policy=policy,
        target_policy=target_policy,
        **variant['trainer_kwargs'],
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_collector,
        evaluation_data_collector=eval_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'],
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """DDPG on HalfCheetah using ConcatMlp critics.

    :param variant: config dict with qf_kwargs, policy_kwargs,
        trainer_kwargs, algorithm_kwargs and replay_buffer_size
    """
    eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    n_obs = eval_env.observation_space.low.size
    n_act = eval_env.action_space.low.size

    qf = ConcatMlp(
        input_size=n_obs + n_act,
        output_size=1,
        **variant['qf_kwargs'],
    )
    policy = TanhMlpPolicy(
        input_size=n_obs,
        output_size=n_act,
        **variant['policy_kwargs'],
    )
    # Targets begin as exact copies of the online networks.
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)

    eval_collector = MdpPathCollector(eval_env, policy)
    # Ornstein-Uhlenbeck noise for exploration.
    noisy_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=OUStrategy(action_space=expl_env.action_space),
        policy=policy,
    )
    expl_collector = MdpPathCollector(expl_env, noisy_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)

    trainer = DDPGTrainer(
        qf=qf,
        target_qf=target_qf,
        policy=policy,
        target_policy=target_policy,
        **variant['trainer_kwargs'],
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_collector,
        evaluation_data_collector=eval_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'],
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """DDPG on the continuous CartPole variant (mode=0).

    :param variant: config dict with qf_kwargs, policy_kwargs,
        trainer_kwargs, algorithm_kwargs and replay_buffer_size
    """
    from cartpole import CartPoleEnv
    expl_env = NormalizedBoxEnv(CartPoleEnv(mode=0))
    eval_env = NormalizedBoxEnv(CartPoleEnv(mode=0))
    n_obs = eval_env.observation_space.low.size
    n_act = eval_env.action_space.low.size

    qf = FlattenMlp(
        input_size=n_obs + n_act,
        output_size=1,
        **variant['qf_kwargs'],
    )
    policy = TanhMlpPolicy(
        input_size=n_obs,
        output_size=n_act,
        **variant['policy_kwargs'],
    )
    # Targets begin as exact copies of the online networks.
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)

    eval_collector = MdpPathCollector(eval_env, policy)
    # Ornstein-Uhlenbeck noise for exploration.
    noisy_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=OUStrategy(action_space=expl_env.action_space),
        policy=policy,
    )
    expl_collector = MdpPathCollector(expl_env, noisy_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)

    trainer = DDPGTrainer(
        qf=qf,
        target_qf=target_qf,
        policy=policy,
        target_policy=target_policy,
        **variant['trainer_kwargs'],
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_collector,
        evaluation_data_collector=eval_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'],
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """SAC on HalfCheetah with twin Q-networks.

    :param variant: config dict with layer_size, trainer_kwargs,
        algorithm_kwargs and replay_buffer_size
    """
    expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    n_obs = expl_env.observation_space.low.size
    n_act = eval_env.action_space.low.size
    width = variant["layer_size"]

    def make_critic():
        # Twin critics and both targets share this architecture.
        return FlattenMlp(input_size=n_obs + n_act, output_size=1,
                          hidden_sizes=[width, width])

    qf1, qf2 = make_critic(), make_critic()
    target_qf1, target_qf2 = make_critic(), make_critic()
    policy = TanhGaussianPolicy(obs_dim=n_obs, action_dim=n_act,
                                hidden_sizes=[width, width])

    # Deterministic policy for evaluation; stochastic for exploration.
    eval_collector = MdpPathCollector(eval_env, MakeDeterministic(policy))
    expl_collector = MdpPathCollector(expl_env, policy)
    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], expl_env)

    trainer = SACTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant["trainer_kwargs"],
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_collector,
        evaluation_data_collector=eval_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"],
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """DDPG on the RLkitUR ROS-backed environment.

    :param variant: config dict with qf_kwargs, policy_kwargs,
        trainer_kwargs, algorithm_kwargs and replay_buffer_size
    """
    # Start the ROS services as a side effect. The previous code bound the
    # call's return value to an unused local `env`; the binding is dropped.
    gym.make('RLkitUR-v0')._start_ros_services()
    eval_env = gym.make('RLkitUR-v0')
    expl_env = gym.make('RLkitUR-v0')
    eval_env = NormalizedBoxEnv(eval_env)
    expl_env = NormalizedBoxEnv(expl_env)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    print("obs_dim: ", obs_dim)
    print("action_dim: ", action_dim)
    qf = FlattenMlp(input_size=obs_dim + action_dim,
                    output_size=1,
                    **variant['qf_kwargs'])
    policy = TanhMlpPolicy(input_size=obs_dim,
                           output_size=action_dim,
                           **variant['policy_kwargs'])
    # Target networks start as exact copies of the online networks.
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)
    # Evaluate the deterministic policy; explore with OU noise on top of it.
    eval_path_collector = MdpPathCollector(eval_env, policy)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=OUStrategy(action_space=expl_env.action_space),
        policy=policy,
    )
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = DDPGTrainer(qf=qf,
                          target_qf=target_qf,
                          policy=policy,
                          target_policy=target_policy,
                          **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """TD3 on a multi-task PointMass env with the goal flattened into obs.

    :param variant: config dict with num_tasks, reward_type, qf_kwargs,
        policy_kwargs, trainer_kwargs, algorithm_kwargs and
        replay_buffer_size
    """
    base_expl_env = PointMassEnv(n=variant["num_tasks"],
                                 reward_type=variant["reward_type"])
    expl_env = FlatGoalEnv(base_expl_env, append_goal_to_obs=True)
    base_eval_env = PointMassEnv(n=variant["num_tasks"],
                                 reward_type=variant["reward_type"])
    eval_env = FlatGoalEnv(base_eval_env, append_goal_to_obs=True)
    n_obs = expl_env.observation_space.low.size
    n_act = expl_env.action_space.low.size
    print(expl_env.observation_space, expl_env.action_space)

    # Twin critics plus targets; separate target policy as TD3 requires.
    qf1 = FlattenMlp(input_size=n_obs + n_act, output_size=1,
                     **variant['qf_kwargs'])
    qf2 = FlattenMlp(input_size=n_obs + n_act, output_size=1,
                     **variant['qf_kwargs'])
    target_qf1 = FlattenMlp(input_size=n_obs + n_act, output_size=1,
                            **variant['qf_kwargs'])
    target_qf2 = FlattenMlp(input_size=n_obs + n_act, output_size=1,
                            **variant['qf_kwargs'])
    policy = TanhMlpPolicy(input_size=n_obs, output_size=n_act,
                           **variant['policy_kwargs'])
    target_policy = TanhMlpPolicy(input_size=n_obs, output_size=n_act,
                                  **variant['policy_kwargs'])

    # Constant-sigma Gaussian exploration noise.
    noise = GaussianStrategy(
        action_space=expl_env.action_space,
        max_sigma=0.1,
        min_sigma=0.1,  # Constant sigma
    )
    noisy_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=noise,
        policy=policy,
    )
    eval_collector = MdpPathCollector(eval_env, policy)
    expl_collector = MdpPathCollector(expl_env, noisy_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)

    trainer = TD3Trainer(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        target_policy=target_policy,
        **variant['trainer_kwargs'],
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_collector,
        evaluation_data_collector=eval_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'],
    )
    # NOTE(review): no algorithm.to(ptu.device) here, unlike the sibling
    # launchers — confirm CPU training is intended.
    algorithm.train()
def experiment(variant):
    """SAC with optional environment modification, in online or batch mode.

    The exploration env is built with the experiment's mod params, while the
    evaluation env uses the unmodified ({}) configuration. 'online'
    collection uses step-wise collection with TorchOnlineRLAlgorithm; any
    other mode uses path-wise collection with TorchBatchRLAlgorithmModEnv,
    which can reschedule env modifications across epochs.

    :param variant: config dict with env, env_mod, env_class, layer_size,
        collection_mode, replay_buffer_size, trainer_kwargs, and the
        explicit algorithm hyperparameters referenced below
    """
    env_params = ENV_PARAMS[variant['env']]
    env_mod_params = variant['env_mod']
    # Merge per-env defaults into the variant in place.
    variant.update(env_params)
    # Exploration uses the modified env; evaluation uses the unmodified one.
    expl_env = NormalizedBoxEnv(variant['env_class'](env_mod_params))
    eval_env = NormalizedBoxEnv(variant['env_class']({}))
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    M = variant['layer_size']
    # Twin Q-networks plus targets, all the same architecture.
    qf1 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    qf2 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf1 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf2 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M],
    )
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = SACTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant['trainer_kwargs'])
    if variant['collection_mode'] == 'online':
        # Step-wise collection for fully online training.
        expl_path_collector = MdpStepCollector(
            expl_env,
            policy,
        )
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            max_path_length=variant['max_path_length'],
            batch_size=variant['batch_size'],
            num_epochs=variant['num_epochs'],
            num_eval_steps_per_epoch=variant['num_eval_steps_per_epoch'],
            num_expl_steps_per_train_loop=variant[
                'num_expl_steps_per_train_loop'],
            num_trains_per_train_loop=variant['num_trains_per_train_loop'],
            min_num_steps_before_training=variant[
                'min_num_steps_before_training'],
        )
    else:
        # Path-wise collection; the ModEnv variant can change the env's
        # modification parameters over the course of training.
        expl_path_collector = MdpPathCollector(
            expl_env,
            policy,
        )
        algorithm = TorchBatchRLAlgorithmModEnv(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            max_path_length=variant['max_path_length'],
            batch_size=variant['batch_size'],
            num_epochs=variant['num_epochs'],
            num_eval_steps_per_epoch=variant['num_eval_steps_per_epoch'],
            num_expl_steps_per_train_loop=variant[
                'num_expl_steps_per_train_loop'],
            num_trains_per_train_loop=variant['num_trains_per_train_loop'],
            min_num_steps_before_training=variant[
                'min_num_steps_before_training'],
            mod_env_epoch_schedule=variant['mod_env_epoch_schedule'],
            env_mod_dist=variant['mod_env_dist'],
            env_class=variant['env_class'],
            env_mod_params=variant['env_mod'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """SAC on the environment produced by make_env().

    :param variant: config dict with layer_size, trainer_kwargs,
        algorithm_kwargs and replay_buffer_size
    """
    expl_env = make_env()
    eval_env = make_env()
    n_obs = expl_env.observation_space.low.size
    n_act = eval_env.action_space.low.size
    width = variant['layer_size']

    def make_critic():
        # Twin critics and both targets share this architecture.
        return ConcatMlp(input_size=n_obs + n_act, output_size=1,
                         hidden_sizes=[width, width])

    qf1, qf2 = make_critic(), make_critic()
    target_qf1, target_qf2 = make_critic(), make_critic()
    policy = TanhGaussianPolicy(
        obs_dim=n_obs,
        action_dim=n_act,
        hidden_sizes=[width, width],
    )

    # Deterministic policy for evaluation; stochastic for exploration.
    eval_collector = MdpPathCollector(eval_env, MakeDeterministic(policy))
    expl_collector = MdpPathCollector(expl_env, policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)

    trainer = SACTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant['trainer_kwargs'],
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_collector,
        evaluation_data_collector=eval_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'],
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Launch image-based SAC on the ImageForkReacher2d pusher task.

    Observations are a flattened image concatenated with a low-dimensional
    state vector.  The Q-functions and the policy each pass the image part
    through a CNN; when ``variant['shared_qf_conv']`` is set, qf1/qf2 share
    one conv encoder and the target networks share another.

    Fixes vs. original: removed a redundant second computation of
    ``action_dim``; an unrecognized ``collection_mode`` now raises
    ``ValueError`` instead of falling through to a ``NameError`` on
    ``algorithm``; deleted commented-out env-registration code.
    """
    from softlearning.environments.gym.mujoco.image_pusher_2d import (
        ImageForkReacher2dEnv)

    env_kwargs = {
        'image_shape': (32, 32, 3),
        'arm_goal_distance_cost_coeff': 0.0,
        'arm_object_distance_cost_coeff': 1.0,
        'goal': (0, -1),
    }
    eval_env = ImageForkReacher2dEnv(**env_kwargs)
    expl_env = ImageForkReacher2dEnv(**env_kwargs)

    input_width, input_height, input_channels = eval_env.image_shape
    image_dim = input_width * input_height * input_channels
    action_dim = int(np.prod(eval_env.action_space.shape))

    cnn_params = variant['cnn_params']
    # output_conv_channels=True / output_size=None: the CNN emits raw conv
    # feature maps; the MLP heads consume the flattened conv output.
    cnn_params.update(
        input_width=input_width,
        input_height=input_height,
        input_channels=input_channels,
        added_fc_input_size=4,
        output_conv_channels=True,
        output_size=None,
    )
    # Size of the non-image tail of the observation vector.
    non_image_dim = int(np.prod(eval_env.observation_space.shape)) - image_dim

    if variant['shared_qf_conv']:
        # One conv encoder shared by qf1/qf2; a second shared by the targets.
        qf_cnn = CNN(**cnn_params)
        qf_obs_processor = nn.Sequential(
            Split(qf_cnn, identity, image_dim),
            FlattenEach(),
            ConcatTuple(),
        )
        qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
        qf_kwargs['obs_processor'] = qf_obs_processor
        qf_kwargs['output_size'] = 1
        qf_kwargs['input_size'] = (
            action_dim + qf_cnn.conv_output_flat_size + non_image_dim)
        qf1 = MlpQfWithObsProcessor(**qf_kwargs)
        qf2 = MlpQfWithObsProcessor(**qf_kwargs)

        target_qf_cnn = CNN(**cnn_params)
        target_qf_obs_processor = nn.Sequential(
            Split(target_qf_cnn, identity, image_dim),
            FlattenEach(),
            ConcatTuple(),
        )
        target_qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
        target_qf_kwargs['obs_processor'] = target_qf_obs_processor
        target_qf_kwargs['output_size'] = 1
        target_qf_kwargs['input_size'] = (
            action_dim + target_qf_cnn.conv_output_flat_size + non_image_dim)
        target_qf1 = MlpQfWithObsProcessor(**target_qf_kwargs)
        target_qf2 = MlpQfWithObsProcessor(**target_qf_kwargs)
    else:
        # Independent conv encoder per Q-network.
        qf1_cnn = CNN(**cnn_params)
        cnn_output_dim = qf1_cnn.conv_output_flat_size
        qf1 = MlpQfWithObsProcessor(
            obs_processor=qf1_cnn,
            output_size=1,
            input_size=action_dim + cnn_output_dim,
            **variant['qf_kwargs'])
        qf2 = MlpQfWithObsProcessor(
            obs_processor=CNN(**cnn_params),
            output_size=1,
            input_size=action_dim + cnn_output_dim,
            **variant['qf_kwargs'])
        target_qf1 = MlpQfWithObsProcessor(
            obs_processor=CNN(**cnn_params),
            output_size=1,
            input_size=action_dim + cnn_output_dim,
            **variant['qf_kwargs'])
        target_qf2 = MlpQfWithObsProcessor(
            obs_processor=CNN(**cnn_params),
            output_size=1,
            input_size=action_dim + cnn_output_dim,
            **variant['qf_kwargs'])

    policy_cnn = CNN(**cnn_params)
    policy_obs_processor = nn.Sequential(
        Split(policy_cnn, identity, image_dim),
        FlattenEach(),
        ConcatTuple(),
    )
    policy = TanhGaussianPolicyAdapter(
        policy_obs_processor,
        policy_cnn.conv_output_flat_size + non_image_dim,
        action_dim,
        **variant['policy_kwargs'])
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
        **variant['eval_path_collector_kwargs'])
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = SACTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant['trainer_kwargs'])

    if variant['collection_mode'] == 'batch':
        expl_path_collector = MdpPathCollector(
            expl_env,
            policy,
            **variant['expl_path_collector_kwargs'])
        algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    elif variant['collection_mode'] == 'online':
        expl_path_collector = MdpStepCollector(
            expl_env,
            policy,
            **variant['expl_path_collector_kwargs'])
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    else:
        # Originally an unknown mode fell through to a NameError on
        # `algorithm`; fail with a clear message instead.
        raise ValueError(
            "Unknown collection_mode: {!r}".format(variant['collection_mode']))

    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Launch UWAC offline RL on a d4rl-style gym environment.

    The replay buffer is filled once from the environment's offline
    dataset (``env.unwrapped.get_dataset()``); no exploration data is
    collected for training (``batch_rl=True``).

    Fix vs. original: the ``buffer_filename`` local was computed from
    ``variant['buffer_filename']`` but never used — it has been removed
    (the dataset is always loaded from the env).
    """
    eval_env = gym.make(variant['env_name'])
    expl_env = eval_env  # evaluation and "exploration" share one env instance
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    M = variant['layer_size']

    # Twin Q-networks with dropout (used by UWAC for uncertainty
    # estimation), plus their target copies.
    qf1 = FlattenMlp_Dropout(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    qf2 = FlattenMlp_Dropout(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf1 = FlattenMlp_Dropout(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf2 = FlattenMlp_Dropout(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M],
    )
    # VAE modeling the behavior policy (used for OOD action detection).
    vae_policy = VAEPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[750, 750],
        latent_dim=action_dim * 2,
    )
    eval_path_collector = CustomMDPPathCollector(eval_env, )
    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    load_hdf5(eval_env.unwrapped.get_dataset(), replay_buffer,
              max_size=variant['replay_buffer_size'])

    trainer = UWACTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        vae=vae_policy,
        **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        batch_rl=True,
        q_learning_alg=True,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Launch (or resume) a Quinoa-style value-function RL experiment.

    Handles env construction (gym / NormalizedBox / encoder-wrapped),
    network construction (V-functions and policies), optional exploration
    strategies, replay/demo buffers, optional video saving, demo loading,
    and BC/Q pretraining before RL training.

    Fixes vs. original: an unknown exploration strategy now raises
    ``ValueError`` (the original executed the bare name ``error``, which
    only raised an incidental ``NameError``); removed a duplicate
    'pen-sparse-v0' entry from the mj_envs id list.
    """
    if variant.get("pretrained_algorithm_path", False):
        # Resume a previously saved algorithm instead of building anew.
        resume(variant)
        return

    if 'env' in variant:
        env_params = ENV_PARAMS[variant['env']]
        variant.update(env_params)
        if 'env_id' in env_params:
            if env_params['env_id'] in [
                    'pen-v0', 'pen-sparse-v0', 'door-v0', 'relocate-v0',
                    'hammer-v0', 'door-sparse-v0', 'relocate-sparse-v0',
                    'hammer-sparse-v0']:
                # Importing mj_envs registers the hand-manipulation envs.
                import mj_envs
            expl_env = gym.make(env_params['env_id'])
            eval_env = gym.make(env_params['env_id'])
        else:
            expl_env = NormalizedBoxEnv(variant['env_class']())
            eval_env = NormalizedBoxEnv(variant['env_class']())
        if variant.get('sparse_reward', False):
            expl_env = RewardWrapperEnv(expl_env, compute_hand_sparse_reward)
            eval_env = RewardWrapperEnv(eval_env, compute_hand_sparse_reward)
        if variant.get('add_env_demos', False):
            variant["path_loader_kwargs"]["demo_paths"].append(
                variant["env_demo_path"])
        if variant.get('add_env_offpolicy_data', False):
            variant["path_loader_kwargs"]["demo_paths"].append(
                variant["env_offpolicy_data_path"])
    else:
        expl_env = encoder_wrapped_env(variant)
        eval_env = encoder_wrapped_env(variant)

    path_loader_kwargs = variant.get("path_loader_kwargs", {})
    stack_obs = path_loader_kwargs.get("stack_obs", 1)
    if stack_obs > 1:
        expl_env = StackObservationEnv(expl_env, stack_obs=stack_obs)
        eval_env = StackObservationEnv(eval_env, stack_obs=stack_obs)

    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    if hasattr(expl_env, 'info_sizes'):
        env_info_sizes = expl_env.info_sizes
    else:
        env_info_sizes = dict()

    M = variant['layer_size']
    vf_kwargs = variant.get("vf_kwargs", {})
    # State-value networks: V(s) -> scalar.
    vf1 = ConcatMlp(
        input_size=obs_dim,
        output_size=1,
        hidden_sizes=[M, M],
        **vf_kwargs
    )
    target_vf1 = ConcatMlp(
        input_size=obs_dim,
        output_size=1,
        hidden_sizes=[M, M],
        **vf_kwargs
    )
    policy_class = variant.get("policy_class", TanhGaussianPolicy)
    policy_kwargs = variant['policy_kwargs']
    policy = policy_class(
        obs_dim=obs_dim,
        action_dim=action_dim,
        **policy_kwargs,
    )
    target_policy = policy_class(
        obs_dim=obs_dim,
        action_dim=action_dim,
        **policy_kwargs,
    )
    buffer_policy_class = variant.get("buffer_policy_class", policy_class)
    buffer_policy = buffer_policy_class(
        obs_dim=obs_dim,
        action_dim=action_dim,
        **variant.get("buffer_policy_kwargs", policy_kwargs),
    )

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )

    expl_policy = policy
    exploration_kwargs = variant.get('exploration_kwargs', {})
    if exploration_kwargs:
        if exploration_kwargs.get("deterministic_exploration", False):
            expl_policy = MakeDeterministic(policy)
        exploration_strategy = exploration_kwargs.get("strategy", None)
        if exploration_strategy is None:
            pass
        elif exploration_strategy == 'ou':
            es = OUStrategy(
                action_space=expl_env.action_space,
                max_sigma=exploration_kwargs['noise'],
                min_sigma=exploration_kwargs['noise'],
            )
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=es,
                policy=expl_policy,
            )
        elif exploration_strategy == 'gauss_eps':
            es = GaussianAndEpislonStrategy(
                action_space=expl_env.action_space,
                max_sigma=exploration_kwargs['noise'],
                min_sigma=exploration_kwargs['noise'],  # constant sigma
                epsilon=0,
            )
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=es,
                policy=expl_policy,
            )
        else:
            raise ValueError(
                "Unknown exploration strategy: {!r}".format(
                    exploration_strategy))

    if variant.get('replay_buffer_class', EnvReplayBuffer) == AWREnvReplayBuffer:
        main_replay_buffer_kwargs = variant['replay_buffer_kwargs']
        main_replay_buffer_kwargs['env'] = expl_env
        # NOTE(review): qf1/qf2 are not defined anywhere in this function
        # (only vf1/target_vf1 are built), so this branch raises NameError
        # if taken — confirm which networks AWREnvReplayBuffer expects.
        main_replay_buffer_kwargs['qf1'] = qf1
        main_replay_buffer_kwargs['qf2'] = qf2
        main_replay_buffer_kwargs['policy'] = policy
    else:
        main_replay_buffer_kwargs = dict(
            max_replay_buffer_size=variant['replay_buffer_size'],
            env=expl_env,
        )
    replay_buffer_kwargs = dict(
        max_replay_buffer_size=variant['replay_buffer_size'],
        env=expl_env,
    )
    replay_buffer = variant.get('replay_buffer_class', EnvReplayBuffer)(
        **main_replay_buffer_kwargs,
    )
    if variant.get('use_validation_buffer', False):
        # 90/10 train/validation split across two identical buffers.
        train_replay_buffer = replay_buffer
        validation_replay_buffer = variant.get(
            'replay_buffer_class', EnvReplayBuffer)(
            **main_replay_buffer_kwargs,
        )
        replay_buffer = SplitReplayBuffer(
            train_replay_buffer, validation_replay_buffer, 0.9)

    trainer_class = variant.get("trainer_class", QuinoaTrainer)
    trainer = trainer_class(
        env=eval_env,
        policy=policy,
        vf1=vf1,
        target_policy=target_policy,
        target_vf1=target_vf1,
        buffer_policy=buffer_policy,
        **variant['trainer_kwargs']
    )

    if variant['collection_mode'] == 'online':
        expl_path_collector = MdpStepCollector(
            expl_env,
            policy,
        )
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            max_path_length=variant['max_path_length'],
            batch_size=variant['batch_size'],
            num_epochs=variant['num_epochs'],
            num_eval_steps_per_epoch=variant['num_eval_steps_per_epoch'],
            num_expl_steps_per_train_loop=variant['num_expl_steps_per_train_loop'],
            num_trains_per_train_loop=variant['num_trains_per_train_loop'],
            min_num_steps_before_training=variant['min_num_steps_before_training'],
        )
    else:
        expl_path_collector = MdpPathCollector(
            expl_env,
            expl_policy,
        )
        algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            max_path_length=variant['max_path_length'],
            batch_size=variant['batch_size'],
            num_epochs=variant['num_epochs'],
            num_eval_steps_per_epoch=variant['num_eval_steps_per_epoch'],
            num_expl_steps_per_train_loop=variant['num_expl_steps_per_train_loop'],
            num_trains_per_train_loop=variant['num_trains_per_train_loop'],
            min_num_steps_before_training=variant['min_num_steps_before_training'],
        )
    algorithm.to(ptu.device)

    # Separate buffers for demonstration train/test data.
    demo_train_buffer = EnvReplayBuffer(
        **replay_buffer_kwargs,
    )
    demo_test_buffer = EnvReplayBuffer(
        **replay_buffer_kwargs,
    )

    if variant.get("save_video", False):
        if variant.get("presampled_goals", None):
            variant['image_env_kwargs']['presampled_goals'] = \
                load_local_or_remote_file(variant['presampled_goals']).item()
        image_eval_env = ImageEnv(
            GymToMultiEnv(eval_env), **variant["image_env_kwargs"])
        image_eval_path_collector = ObsDictPathCollector(
            image_eval_env,
            eval_policy,
            observation_key="state_observation",
        )
        image_expl_env = ImageEnv(
            GymToMultiEnv(expl_env), **variant["image_env_kwargs"])
        image_expl_path_collector = ObsDictPathCollector(
            image_expl_env,
            expl_policy,
            observation_key="state_observation",
        )
        video_func = VideoSaveFunction(
            image_eval_env,
            variant,
            image_expl_path_collector,
            image_eval_path_collector,
        )
        algorithm.post_train_funcs.append(video_func)

    if variant.get('save_paths', False):
        algorithm.post_train_funcs.append(save_paths)

    if variant.get('load_demos', False):
        path_loader_class = variant.get('path_loader_class', MDPPathLoader)
        path_loader = path_loader_class(
            trainer,
            replay_buffer=replay_buffer,
            demo_train_buffer=demo_train_buffer,
            demo_test_buffer=demo_test_buffer,
            **path_loader_kwargs
        )
        path_loader.load_demos()

    if variant.get('save_initial_buffers', False):
        buffers = dict(
            replay_buffer=replay_buffer,
            demo_train_buffer=demo_train_buffer,
            demo_test_buffer=demo_test_buffer,
        )
        buffer_path = osp.join(logger.get_snapshot_dir(), 'buffers.p')
        pickle.dump(buffers, open(buffer_path, "wb"))

    if variant.get('pretrain_policy', False):
        trainer.pretrain_policy_with_bc()
    if variant.get('pretrain_rl', False):
        trainer.pretrain_q_with_bc_data()

    if variant.get('save_pretrained_algorithm', False):
        p_path = osp.join(logger.get_snapshot_dir(), 'pretrain_algorithm.p')
        pt_path = osp.join(logger.get_snapshot_dir(), 'pretrain_algorithm.pt')
        data = algorithm._get_snapshot()
        data['algorithm'] = algorithm
        torch.save(data, open(pt_path, "wb"))
        torch.save(data, open(p_path, "wb"))

    if variant.get('train_rl', True):
        algorithm.train()
def offpolicy_main(variant):
    """Run off-policy RL (SAC or TD3) for the DoorGym-style setup.

    Relies on module-level ``args`` (CLI arguments) and ``env_kwargs`` —
    TODO(review): consider passing these explicitly.

    Fixes vs. original: an unsupported ``args.algo`` now fails immediately
    with ``ValueError`` (previously ``algo`` was left undefined and a
    ``NameError`` surfaced later); the replay-buffer pickle file is opened
    with a context manager so the handle is always closed.
    """
    print("offpolicy main")
    if args.algo == 'sac':
        algo = "SAC"
    elif args.algo == 'td3':
        algo = "TD3"
    else:
        raise ValueError("Unsupported algo: {!r}".format(args.algo))

    setup_logger('{0}_{1}'.format(args.env_name, args.save_name),
                 variant=variant)
    ptu.set_gpu_mode(True)  # optionally set the GPU (default=True)

    expl_env, eval_env, env_obj = prepare_env(
        args.env_name, args.visionmodel_path, **env_kwargs)
    obs_dim = expl_env.observation_space.low.size
    action_dim = expl_env.action_space.low.size
    expl_policy, eval_policy, trainer = prepare_trainer(
        algo, expl_env, obs_dim, action_dim,
        args.pretrained_policy_load, variant)

    is_doorenv = args.env_name.find('doorenv') > -1
    if is_doorenv:
        # Mirror door-specific settings onto both policies.
        expl_policy.knob_noisy = eval_policy.knob_noisy = args.knob_noisy
        expl_policy.nn = eval_policy.nn = env_obj.nn
        expl_policy.visionnet_input = eval_policy.visionnet_input = \
            env_obj.visionnet_input
        if args.visionnet_input:
            visionmodel = load_visionmodel(
                expl_env._wrapped_env.xml_path,
                args.visionmodel_path,
                VisionModelXYZ())
            visionmodel.to(ptu.device)
            expl_policy.visionmodel = visionmodel.eval()
        else:
            expl_policy.visionmodel = None

    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
        doorenv=is_doorenv,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
        doorenv=is_doorenv,
    )

    if not args.replaybuffer_load:
        replay_buffer = EnvReplayBuffer(
            variant['replay_buffer_size'],
            expl_env,
        )
    else:
        with open(args.replaybuffer_load, "rb") as f:
            replay_buffer = pickle.load(f)
        # Restore the key view lost through pickling.
        replay_buffer._env_info_keys = replay_buffer.env_info_sizes.keys()
        print("Loaded the replay buffer that has length of {}".format(
            replay_buffer.get_diagnostics()))

    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])

    # Attach bookkeeping used by the (customized) algorithm for saving
    # checkpoints and TensorBoard logging.
    algorithm.save_interval = args.save_interval
    algorithm.save_dir = args.save_dir
    algorithm.algo = args.algo
    algorithm.env_name = args.env_name
    algorithm.save_name = args.save_name
    algorithm.env_kwargs = env_kwargs
    summary_name = args.log_dir + '{0}_{1}'
    writer = SummaryWriter(summary_name.format(args.env_name, args.save_name))
    algorithm.writer = writer

    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Launch (or resume) an AWAC experiment.

    Builds envs, twin Q-networks (plus targets), policy and buffer policy
    (optionally loaded from files), optional exploration strategies,
    replay/demo buffers, optional video saving, demo loading, and optional
    BC/Q pretraining before RL training.

    Fixes vs. original: an unknown exploration strategy now raises
    ``ValueError`` (the original executed the bare name ``error``, which
    only raised an incidental ``NameError``); the inner ``get_img_env``
    helper now returns the env it builds (it previously returned None).
    """
    if variant.get("pretrained_algorithm_path", False):
        # Resume a previously saved algorithm instead of building anew.
        resume(variant)
        return

    normalize_env = variant.get("normalize_env", True)
    env_id = variant.get("env_id", None)
    env_params = ENV_PARAMS.get(env_id, {})
    variant.update(env_params)
    env_class = variant.get("env_class", None)
    env_kwargs = variant.get("env_kwargs", {})

    expl_env = make(env_id, env_class, env_kwargs, normalize_env)
    eval_env = make(env_id, env_class, env_kwargs, normalize_env)

    if variant.get("add_env_demos", False):
        variant["path_loader_kwargs"]["demo_paths"].append(
            variant["env_demo_path"])
    if variant.get("add_env_offpolicy_data", False):
        variant["path_loader_kwargs"]["demo_paths"].append(
            variant["env_offpolicy_data_path"])

    path_loader_kwargs = variant.get("path_loader_kwargs", {})
    stack_obs = path_loader_kwargs.get("stack_obs", 1)
    if stack_obs > 1:
        expl_env = StackObservationEnv(expl_env, stack_obs=stack_obs)
        eval_env = StackObservationEnv(eval_env, stack_obs=stack_obs)

    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    if hasattr(expl_env, "info_sizes"):
        env_info_sizes = expl_env.info_sizes
    else:
        env_info_sizes = dict()

    qf_kwargs = variant.get("qf_kwargs", {})
    # Twin Q-networks Q(s, a) -> scalar, plus target copies.
    qf1 = ConcatMlp(input_size=obs_dim + action_dim,
                    output_size=1,
                    **qf_kwargs)
    qf2 = ConcatMlp(input_size=obs_dim + action_dim,
                    output_size=1,
                    **qf_kwargs)
    target_qf1 = ConcatMlp(input_size=obs_dim + action_dim,
                           output_size=1,
                           **qf_kwargs)
    target_qf2 = ConcatMlp(input_size=obs_dim + action_dim,
                           output_size=1,
                           **qf_kwargs)

    policy_class = variant.get("policy_class", TanhGaussianPolicy)
    policy_kwargs = variant["policy_kwargs"]
    policy_path = variant.get("policy_path", False)
    if policy_path:
        policy = load_local_or_remote_file(policy_path)
    else:
        policy = policy_class(
            obs_dim=obs_dim,
            action_dim=action_dim,
            **policy_kwargs,
        )
    buffer_policy_path = variant.get("buffer_policy_path", False)
    if buffer_policy_path:
        buffer_policy = load_local_or_remote_file(buffer_policy_path)
    else:
        buffer_policy_class = variant.get("buffer_policy_class", policy_class)
        buffer_policy = buffer_policy_class(
            obs_dim=obs_dim,
            action_dim=action_dim,
            **variant.get("buffer_policy_kwargs", policy_kwargs),
        )

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )

    expl_policy = policy
    exploration_kwargs = variant.get("exploration_kwargs", {})
    if exploration_kwargs:
        if exploration_kwargs.get("deterministic_exploration", False):
            expl_policy = MakeDeterministic(policy)
        exploration_strategy = exploration_kwargs.get("strategy", None)
        if exploration_strategy is None:
            pass
        elif exploration_strategy == "ou":
            es = OUStrategy(
                action_space=expl_env.action_space,
                max_sigma=exploration_kwargs["noise"],
                min_sigma=exploration_kwargs["noise"],
            )
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=es,
                policy=expl_policy,
            )
        elif exploration_strategy == "gauss_eps":
            es = GaussianAndEpsilonStrategy(
                action_space=expl_env.action_space,
                max_sigma=exploration_kwargs["noise"],
                min_sigma=exploration_kwargs["noise"],  # constant sigma
                epsilon=0,
            )
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=es,
                policy=expl_policy,
            )
        else:
            raise ValueError(
                "Unknown exploration strategy: {!r}".format(
                    exploration_strategy))

    main_replay_buffer_kwargs = dict(
        max_replay_buffer_size=variant["replay_buffer_size"],
        env=expl_env,
    )
    replay_buffer_kwargs = dict(
        max_replay_buffer_size=variant["replay_buffer_size"],
        env=expl_env,
    )
    replay_buffer = variant.get("replay_buffer_class", EnvReplayBuffer)(
        **main_replay_buffer_kwargs, )
    if variant.get("use_validation_buffer", False):
        # 90/10 train/validation split across two identical buffers.
        train_replay_buffer = replay_buffer
        validation_replay_buffer = variant.get(
            "replay_buffer_class", EnvReplayBuffer)(
            **main_replay_buffer_kwargs, )
        replay_buffer = SplitReplayBuffer(
            train_replay_buffer, validation_replay_buffer, 0.9)

    trainer_class = variant.get("trainer_class", AWACTrainer)
    trainer = trainer_class(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        buffer_policy=buffer_policy,
        **variant["trainer_kwargs"],
    )

    if variant["collection_mode"] == "online":
        expl_path_collector = MdpStepCollector(
            expl_env,
            policy,
        )
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            max_path_length=variant["max_path_length"],
            batch_size=variant["batch_size"],
            num_epochs=variant["num_epochs"],
            num_eval_steps_per_epoch=variant["num_eval_steps_per_epoch"],
            num_expl_steps_per_train_loop=variant[
                "num_expl_steps_per_train_loop"],
            num_trains_per_train_loop=variant["num_trains_per_train_loop"],
            min_num_steps_before_training=variant[
                "min_num_steps_before_training"],
        )
    else:
        expl_path_collector = MdpPathCollector(
            expl_env,
            expl_policy,
        )
        algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            max_path_length=variant["max_path_length"],
            batch_size=variant["batch_size"],
            num_epochs=variant["num_epochs"],
            num_eval_steps_per_epoch=variant["num_eval_steps_per_epoch"],
            num_expl_steps_per_train_loop=variant[
                "num_expl_steps_per_train_loop"],
            num_trains_per_train_loop=variant["num_trains_per_train_loop"],
            min_num_steps_before_training=variant[
                "min_num_steps_before_training"],
        )
    algorithm.to(ptu.device)

    # Separate buffers for demonstration train/test data.
    demo_train_buffer = EnvReplayBuffer(**replay_buffer_kwargs, )
    demo_test_buffer = EnvReplayBuffer(**replay_buffer_kwargs, )

    if variant.get("save_video", False):
        if variant.get("presampled_goals", None):
            variant["image_env_kwargs"][
                "presampled_goals"] = load_local_or_remote_file(
                    variant["presampled_goals"]).item()

        def get_img_env(env):
            # NOTE(review): currently unused (the ImageEnv path below is
            # active instead); kept as the renderer-based alternative.
            renderer = EnvRenderer(**variant["renderer_kwargs"])
            img_env = InsertImageEnv(GymToMultiEnv(env), renderer=renderer)
            return img_env

        image_eval_env = ImageEnv(GymToMultiEnv(eval_env),
                                  **variant["image_env_kwargs"])
        # image_eval_env = get_img_env(eval_env)
        image_eval_path_collector = ObsDictPathCollector(
            image_eval_env,
            eval_policy,
            observation_key="state_observation",
        )
        image_expl_env = ImageEnv(GymToMultiEnv(expl_env),
                                  **variant["image_env_kwargs"])
        # image_expl_env = get_img_env(expl_env)
        image_expl_path_collector = ObsDictPathCollector(
            image_expl_env,
            expl_policy,
            observation_key="state_observation",
        )
        video_func = VideoSaveFunction(
            image_eval_env,
            variant,
            image_expl_path_collector,
            image_eval_path_collector,
        )
        algorithm.post_train_funcs.append(video_func)

    if variant.get("save_paths", False):
        algorithm.post_train_funcs.append(save_paths)

    if variant.get("load_demos", False):
        path_loader_class = variant.get("path_loader_class", MDPPathLoader)
        path_loader = path_loader_class(
            trainer,
            replay_buffer=replay_buffer,
            demo_train_buffer=demo_train_buffer,
            demo_test_buffer=demo_test_buffer,
            **path_loader_kwargs,
        )
        path_loader.load_demos()
    if variant.get("load_env_dataset_demos", False):
        path_loader_class = variant.get("path_loader_class", HDF5PathLoader)
        path_loader = path_loader_class(
            trainer,
            replay_buffer=replay_buffer,
            demo_train_buffer=demo_train_buffer,
            demo_test_buffer=demo_test_buffer,
            **path_loader_kwargs,
        )
        path_loader.load_demos(expl_env.get_dataset())

    if variant.get("save_initial_buffers", False):
        buffers = dict(
            replay_buffer=replay_buffer,
            demo_train_buffer=demo_train_buffer,
            demo_test_buffer=demo_test_buffer,
        )
        buffer_path = osp.join(logger.get_snapshot_dir(), "buffers.p")
        pickle.dump(buffers, open(buffer_path, "wb"))

    if variant.get("pretrain_buffer_policy", False):
        trainer.pretrain_policy_with_bc(
            buffer_policy,
            replay_buffer.train_replay_buffer,
            replay_buffer.validation_replay_buffer,
            10000,
            label="buffer",
        )
    if variant.get("pretrain_policy", False):
        trainer.pretrain_policy_with_bc(
            policy,
            demo_train_buffer,
            demo_test_buffer,
            trainer.bc_num_pretrain_steps,
        )
    if variant.get("pretrain_rl", False):
        trainer.pretrain_q_with_bc_data()

    if variant.get("save_pretrained_algorithm", False):
        p_path = osp.join(logger.get_snapshot_dir(), "pretrain_algorithm.p")
        pt_path = osp.join(logger.get_snapshot_dir(), "pretrain_algorithm.pt")
        data = algorithm._get_snapshot()
        data["algorithm"] = algorithm
        torch.save(data, open(pt_path, "wb"))
        torch.save(data, open(p_path, "wb"))

    if variant.get("train_rl", True):
        algorithm.train()
def experiment(variant):
    """Train DQN over symbolic macro-actions using CNN Q-networks.

    Image observations feed two identical CNNs (online and target); the
    replay buffer is keyed to a discrete symbolic action space attached to
    a third env instance.
    """
    setup_logger("name-of-experiment", variant=variant)
    ptu.set_gpu_mode(True)

    expl_env = gym.make(variant["env_name"])
    eval_env = gym.make(variant["env_name"])
    obs_dim = expl_env.observation_space.image.shape[1]
    channels = expl_env.observation_space.image.shape[0]
    action_dim = SYMBOLIC_ACTION_COUNT

    # The replay buffer stores symbolic actions, so give it an env whose
    # action space is the discrete symbolic one.
    symbolic_action_space = gym.spaces.Discrete(SYMBOLIC_ACTION_COUNT)
    symb_env = gym.make(variant["env_name"])
    symb_env.action_space = symbolic_action_space

    # Online and target Q-networks share the same architecture.
    cnn_arch = dict(
        input_width=obs_dim,
        input_height=obs_dim,
        input_channels=channels,
        output_size=action_dim,
        kernel_sizes=[8, 4],
        n_channels=[16, 32],
        strides=[4, 2],
        paddings=[0, 0],
        hidden_sizes=[256],
    )
    qf = CNN(**cnn_arch)
    target_qf = CNN(**cnn_arch)
    qf_criterion = nn.MSELoss()

    eval_policy = LearnPlanPolicy(None)
    expl_policy = LearnPlanPolicy(None)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
        rollout=hierarchical_rollout,
        render=variant["render"],
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
        rollout=hierarchical_rollout,
        render=variant["render"],
    )

    trainer = DQNTrainer(
        qf=qf,
        target_qf=target_qf,
        qf_criterion=qf_criterion,
        **variant["trainer_kwargs"],
    )
    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], symb_env)
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"],
    )
    algorithm.to(ptu.device)
    algorithm.train()
def run_experiment_func(variant):
    """Build and return (without training) a SAC TorchBatchRLAlgorithm.

    Looks up per-env hyperparameters from ``ENV_PARAMS``, constructs twin
    Q-networks plus targets, a tanh-Gaussian policy, collectors, and a
    replay buffer, and wires them into the algorithm.

    Fix vs. original: removed the ``vf`` ConcatMlp, which was constructed
    but never passed to the trainer or used anywhere (dead code).

    Returns:
        TorchBatchRLAlgorithm: fully configured but not yet trained.
    """
    env_params = ENV_PARAMS[variant['env']]
    variant.update(env_params)

    expl_env = NormalizedBoxEnv(variant['env_class']())
    eval_env = NormalizedBoxEnv(variant['env_class']())
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    M = variant['layer_size']
    qf1 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    qf2 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf1 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf2 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M],
    )
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = SACTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        discount=variant['discount'],
        soft_target_tau=variant['soft_target_tau'],
        target_update_period=variant['target_update_period'],
        policy_lr=variant['policy_lr'],
        qf_lr=variant['qf_lr'],
        reward_scale=1,
        use_automatic_entropy_tuning=True,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        max_path_length=variant['max_path_length'],
        batch_size=variant['batch_size'],
        num_epochs=variant['num_epochs'],
        num_eval_steps_per_epoch=variant['num_eval_steps_per_epoch'],
        num_expl_steps_per_train_loop=variant['num_expl_steps_per_train_loop'],
        num_trains_per_train_loop=variant['num_trains_per_train_loop'],
        min_num_steps_before_training=variant['min_num_steps_before_training'],
    )
    return algorithm