import json
import os
import pickle

import numpy as np
import torch.nn as nn

import rlkit.torch.pytorch_util as ptu
# NOTE: the remaining names used below (ToolsEnv, gen_network, ArgmaxDiscretePolicy,
# PolicyWrappedWithExplorationStrategy, EpsilonGreedyDecay, the path collector and
# RL algorithm classes, DoubleDQNTrainer, EnvReplayBuffer) are assumed to be
# imported from the project's own modules elsewhere in this file.


def experiment(variant):
    from rlkit.envs.gym_minigrid.gym_minigrid import envs

    expl_env = ToolsEnv(**variant['env_kwargs'])
    eval_env = ToolsEnv(**variant['env_kwargs'])
    rollout_env = ToolsEnv(**variant['env_kwargs'])
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    layer_size = variant['algo_kwargs']['layer_size']

    # A time horizon of 0 marks a "lifetime" (non-episodic) environment.
    lifetime = variant['env_kwargs'].get('time_horizon', 0) == 0
    if lifetime:
        assert eval_env.time_horizon == 0, 'cannot have time horizon for lifetime env'

    qf = gen_network(variant['algo_kwargs'], action_dim, layer_size)
    target_qf = gen_network(variant['algo_kwargs'], action_dim, layer_size)
    qf_criterion = nn.MSELoss()

    eval_policy = ArgmaxDiscretePolicy(qf)
    # eval_policy = SoftmaxQPolicy(qf)
    expl_policy = PolicyWrappedWithExplorationStrategy(
        EpsilonGreedyDecay(expl_env.action_space, 1e-5, 1, 0.1),
        eval_policy,
    )
    if lifetime:
        eval_policy = expl_policy
    # expl_policy = PolicyWrappedWithExplorationStrategy(
    #     EpsilonGreedy(expl_env.action_space, 0.5),
    #     eval_policy,
    # )

    if eval_env.time_horizon == 0:
        collector_class = LifetimeMdpPathCollector if lifetime else MdpPathCollector
    else:
        collector_class = MdpPathCollectorConfig
    eval_path_collector = collector_class(
        eval_env,
        eval_policy,
        # render=True
    )
    expl_path_collector = collector_class(expl_env, expl_policy)

    trainer = DoubleDQNTrainer(
        qf=qf,
        target_qf=target_qf,
        qf_criterion=qf_criterion,
        **variant['algo_kwargs']['trainer_kwargs'])
    replay_buffer = EnvReplayBuffer(
        variant['algo_kwargs']['replay_buffer_size'], expl_env)

    # algo_class = TorchLifetimeRLAlgorithm if lifetime else TorchBatchRLAlgorithm
    algo_class = TorchHumanInputLifetimeRLAlgorithm
    algorithm = algo_class(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        rollout_env=rollout_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs']['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
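# Illustrative only: a minimal sketch of the `variant` structure that
# `experiment` reads above (env_kwargs, algo_kwargs with layer_size,
# replay_buffer_size, trainer_kwargs, algorithm_kwargs). The concrete values,
# the EXAMPLE_VARIANT name, and the trainer/algorithm sub-keys are assumptions;
# gen_network and TorchHumanInputLifetimeRLAlgorithm may expect additional keys
# not shown here.
EXAMPLE_VARIANT = dict(
    env_kwargs=dict(
        grid_size=8,
        task='make axe',
        make_rtype='sparse',
        time_horizon=200,  # 0 would select the lifetime (non-episodic) code path
    ),
    algo_kwargs=dict(
        layer_size=128,
        replay_buffer_size=int(1e6),
        trainer_kwargs=dict(
            learning_rate=3e-4,  # hypothetical DQN settings
            discount=0.99,
        ),
        algorithm_kwargs=dict(
            num_epochs=100,      # hypothetical training schedule
            batch_size=128,
        ),
    ),
)
# experiment(EXAMPLE_VARIANT)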
def gen_validation_envs(n, filename, **kwargs):
    envs = []
    seeds = np.random.randint(0, 100000, n).tolist()
    for idx in range(n):
        env_kwargs = dict(
            grid_size=8,
            # start agent at random pos
            agent_start_pos=None,
            health_cap=1000,
            gen_resources=True,
            fully_observed=False,
            task='make axe',
            make_rtype='sparse',
            fixed_reset=False,
            only_partial_obs=True,
            init_resources={
                'metal': 3,
                'wood': 3
            },
            resource_prob={
                'metal': 0.02,
                'wood': 0.02
            },
            fixed_expected_resources=True,
            end_on_task_completion=True,
            time_horizon=200,
            seed=seeds[idx])
        env_kwargs.update(**kwargs)
        env = ToolsEnv(**env_kwargs)
        envs.append(env)
    with open(filename, 'wb') as f:
        pickle.dump({'envs': envs, 'seeds': seeds}, f)
    print('Generated %d envs at file: %s' % (n, filename))
def gen_validation_envs(n, filename, **kwargs):
    envs = []
    seeds = np.random.randint(0, 100000, n).tolist()
    for idx in range(n):
        env_kwargs = dict(
            grid_size=8,
            # start agent at random pos
            agent_start_pos=None,
            health_cap=1000,
            gen_resources=False,
            fully_observed=False,
            task='make axe',
            make_rtype='sparse',
            fixed_reset=False,
            only_partial_obs=True,
            init_resources={
                'metal': 1,
                'wood': 1
            },
            resource_prob={
                'metal': 0.0,
                'wood': 0.0
            },
            fixed_expected_resources=True,
            end_on_task_completion=True,
            time_horizon=100,
            seed=seeds[idx])
        env_kwargs.update(**kwargs)
        env = ToolsEnv(**env_kwargs)
        envs.append(env)
    with open(filename, 'wb') as f:
        pickle.dump({'envs': envs, 'seeds': seeds}, f)
    # Dump the (last) env config alongside the pickle for reference.
    with open(os.path.splitext(filename)[0] + '.json', 'w') as f:
        json.dump(env_kwargs, f, indent=4, sort_keys=True)
    print('Generated %d envs at file: %s' % (n, filename))
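# A minimal usage sketch (assumed entry point, not part of the original script):
# generate a fixed set of validation environments, then reload them from disk.
# The file name and env count here are hypothetical.
if __name__ == '__main__':
    gen_validation_envs(10, 'validation_envs.pkl')
    with open('validation_envs.pkl', 'rb') as f:
        data = pickle.load(f)
    print('Reloaded %d envs with seeds %s' % (len(data['envs']), data['seeds']))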