def test_onehots_consistent_with_task_sampler():
    # Import and construct environments here to avoid using up too many
    # resources if this test isn't run.
    # pylint: disable=import-outside-toplevel
    import metaworld
    mt10 = metaworld.MT10()
    env = MetaWorldSetTaskEnv(mt10, 'train', add_env_onehot=True)
    policy = RandomPolicy(env.action_space)
    workers = WorkerFactory(seed=100, max_episode_length=1, n_workers=10)
    sampler1 = LocalSampler.from_worker_factory(workers, policy, env)
    env_ups = [
        SetTaskUpdate(MetaWorldSetTaskEnv, task, None)
        for task in env.sample_tasks(10)
    ]
    samples1 = sampler1.obtain_exact_episodes(1, policy, env_ups)
    task_sampler = MetaWorldTaskSampler(mt10, 'train', add_env_onehot=True)
    env_ups = task_sampler.sample(10)
    sampler2 = LocalSampler.from_worker_factory(workers, policy, env_ups)
    samples2 = sampler2.obtain_exact_episodes(1, policy, env_ups)
    name_to_obs1 = {}
    for obs1, name1 in zip(samples1.observations,
                           samples1.env_infos['task_name']):
        name_to_obs1[name1] = obs1
    for obs2, name2 in zip(samples2.observations,
                           samples2.env_infos['task_name']):
        assert (name_to_obs1[name2][-10:] == obs2[-10:]).all()
def mttrpo_metaworld_mt50(ctxt, seed, epochs, batch_size, n_workers, n_tasks):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        batch_size (int): Number of environment steps in one batch.
        n_workers (int): The number of workers the sampler should use.
        n_tasks (int): Number of tasks to use. Should be a multiple of 50.

    """
    set_seed(seed)
    mt50 = metaworld.MT50()
    train_task_sampler = MetaWorldTaskSampler(mt50,
                                              'train',
                                              lambda env, _: normalize(env),
                                              add_env_onehot=True)
    assert n_tasks % 50 == 0
    assert n_tasks <= 2500
    envs = [env_up() for env_up in train_task_sampler.sample(n_tasks)]
    env = MultiEnvWrapper(envs,
                          sample_strategy=round_robin_strategy,
                          mode='vanilla')
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )
    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)
    sampler = RaySampler(agents=policy,
                         envs=env,
                         max_episode_length=env.spec.max_episode_length,
                         n_workers=n_workers)
    algo = TRPO(env_spec=env.spec,
                policy=policy,
                value_function=value_function,
                sampler=sampler,
                discount=0.99,
                gae_lambda=0.95)
    trainer = Trainer(ctxt)
    trainer.setup(algo, env)
    trainer.train(n_epochs=epochs, batch_size=batch_size)
def mtppo_metaworld_mt1_push(ctxt, seed, epochs, batch_size):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        batch_size (int): Number of environment steps in one batch.

    """
    set_seed(seed)
    n_tasks = 50
    mt1 = metaworld.MT1('push-v1')
    train_task_sampler = MetaWorldTaskSampler(mt1, 'train',
                                              lambda env, _: normalize(env))
    envs = [env_up() for env_up in train_task_sampler.sample(n_tasks)]
    env = MultiEnvWrapper(envs,
                          sample_strategy=round_robin_strategy,
                          mode='vanilla')
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )
    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)
    sampler = RaySampler(agents=policy,
                         envs=env,
                         max_episode_length=env.spec.max_episode_length)
    algo = PPO(env_spec=env.spec,
               policy=policy,
               value_function=value_function,
               sampler=sampler,
               discount=0.99,
               gae_lambda=0.95,
               center_adv=True,
               lr_clip_range=0.2)
    trainer = Trainer(ctxt)
    trainer.setup(algo, env)
    trainer.train(n_epochs=epochs, batch_size=batch_size)
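# A minimal launch sketch for the experiment function above, assuming the
# usual garage example conventions (wrap_experiment creates the snapshot
# directory and passes the resulting ExperimentContext as `ctxt`). The click
# options and their default values are illustrative assumptions, not taken
# from the original script.
import click

from garage import wrap_experiment


@click.command()
@click.option('--seed', default=1, type=int)
@click.option('--epochs', default=500, type=int)
@click.option('--batch_size', default=1024, type=int)
def launch_mtppo_mt1_push(seed, epochs, batch_size):
    # wrap_experiment injects the ExperimentContext as the first argument.
    experiment = wrap_experiment(mtppo_metaworld_mt1_push)
    experiment(seed=seed, epochs=epochs, batch_size=batch_size)


if __name__ == '__main__':
    launch_mtppo_mt1_push()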
def te_ppo_mt1_push(ctxt, seed, n_epochs, batch_size_per_task):
    """Train Task Embedding PPO on the MT1 push environment.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~Trainer` to create the :class:`~Snapshotter`.
        seed (int): Used to seed the random number generator to produce
            determinism.
        n_epochs (int): Total number of epochs for training.
        batch_size_per_task (int): Batch size of samples for each task.

    """
    set_seed(seed)
    n_tasks = 50
    mt1 = metaworld.MT1('push-v1')
    task_sampler = MetaWorldTaskSampler(mt1,
                                        'train',
                                        lambda env, _: normalize(env),
                                        add_env_onehot=False)
    envs = [env_up() for env_up in task_sampler.sample(n_tasks)]
    env = MultiEnvWrapper(envs,
                          sample_strategy=round_robin_strategy,
                          mode='vanilla')

    latent_length = 2
    inference_window = 6
    batch_size = batch_size_per_task * n_tasks
    policy_ent_coeff = 2e-2
    encoder_ent_coeff = 2e-4
    inference_ce_coeff = 5e-2
    embedding_init_std = 0.1
    embedding_max_std = 0.2
    embedding_min_std = 1e-6
    policy_init_std = 1.0
    policy_max_std = None
    policy_min_std = None

    with TFTrainer(snapshot_config=ctxt) as trainer:
        task_embed_spec = TEPPO.get_encoder_spec(env.task_space,
                                                 latent_dim=latent_length)
        task_encoder = GaussianMLPEncoder(
            name='embedding',
            embedding_spec=task_embed_spec,
            hidden_sizes=(20, 20),
            std_share_network=True,
            init_std=embedding_init_std,
            max_std=embedding_max_std,
            output_nonlinearity=tf.nn.tanh,
            min_std=embedding_min_std,
        )

        traj_embed_spec = TEPPO.get_infer_spec(
            env.spec,
            latent_dim=latent_length,
            inference_window_size=inference_window)
        inference = GaussianMLPEncoder(
            name='inference',
            embedding_spec=traj_embed_spec,
            hidden_sizes=(20, 10),
            std_share_network=True,
            init_std=2.0,
            output_nonlinearity=tf.nn.tanh,
            min_std=embedding_min_std,
        )

        policy = GaussianMLPTaskEmbeddingPolicy(
            name='policy',
            env_spec=env.spec,
            encoder=task_encoder,
            hidden_sizes=(32, 16),
            std_share_network=True,
            max_std=policy_max_std,
            init_std=policy_init_std,
            min_std=policy_min_std,
        )

        baseline = LinearMultiFeatureBaseline(
            env_spec=env.spec,
            features=['observations', 'tasks', 'latents'])

        algo = TEPPO(env_spec=env.spec,
                     policy=policy,
                     baseline=baseline,
                     inference=inference,
                     discount=0.99,
                     lr_clip_range=0.2,
                     policy_ent_coeff=policy_ent_coeff,
                     encoder_ent_coeff=encoder_ent_coeff,
                     inference_ce_coeff=inference_ce_coeff,
                     use_softplus_entropy=True,
                     optimizer_args=dict(
                         batch_size=32,
                         max_optimization_epochs=10,
                         learning_rate=1e-3,
                     ),
                     inference_optimizer_args=dict(
                         batch_size=32,
                         max_optimization_epochs=10,
                     ),
                     center_adv=True,
                     stop_ce_gradient=True)

        trainer.setup(algo,
                      env,
                      sampler_cls=LocalSampler,
                      sampler_args=None,
                      worker_class=TaskEmbeddingWorker)
        trainer.train(n_epochs=n_epochs, batch_size=batch_size, plot=False)
def mtsac_metaworld_mt10(ctxt=None, *, seed, _gpu, n_tasks, timesteps):
    """Train MTSAC with MT10 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        _gpu (int): The ID of the gpu to be used (used on multi-gpu machines).
        n_tasks (int): Number of tasks to use. Should be a multiple of 10.
        timesteps (int): Number of timesteps to run.

    """
    deterministic.set_seed(seed)
    trainer = Trainer(ctxt)
    mt10 = metaworld.MT10()  # pylint: disable=no-member
    mt10_test = metaworld.MT10()  # pylint: disable=no-member

    # pylint: disable=missing-return-doc, missing-return-type-doc
    def wrap(env, _):
        return normalize(env, normalize_reward=True)

    train_task_sampler = MetaWorldTaskSampler(mt10,
                                              'train',
                                              wrap,
                                              add_env_onehot=True)
    test_task_sampler = MetaWorldTaskSampler(mt10_test,
                                             'train',
                                             add_env_onehot=True)
    assert n_tasks % 10 == 0
    assert n_tasks <= 500
    mt10_train_envs = train_task_sampler.sample(n_tasks)
    env = mt10_train_envs[0]()
    mt10_test_envs = [env_up() for env_up in test_task_sampler.sample(n_tasks)]

    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[400, 400, 400],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )
    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)
    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

    meta_batch_size = 10
    sampler = LocalSampler(
        agents=policy,
        envs=mt10_train_envs,
        max_episode_length=env.spec.max_episode_length,
        # 1 sampler worker for each environment
        n_workers=meta_batch_size,
        worker_class=FragmentWorker,
        # Increasing n_envs increases the vectorization of a sampler worker,
        # which improves runtime performance, but you will need to adjust it
        # depending on your memory constraints. For reference, each worker by
        # default uses n_envs=8, and each environment is approximately 50 MB,
        # so creating 50 envs with 8 copies comes out to 20 GB of memory. Many
        # users want to be able to run multiple seeds on one machine, so this
        # has been reduced to n_envs=2 (2 copies) in the meantime.
        worker_args=dict(n_envs=2))

    batch_size = int(env.spec.max_episode_length * meta_batch_size)
    num_evaluation_points = 500
    epochs = timesteps // batch_size
    epoch_cycles = epochs // num_evaluation_points
    epochs = epochs // epoch_cycles
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  sampler=sampler,
                  gradient_steps_per_itr=env.spec.max_episode_length,
                  eval_env=mt10_test_envs,
                  env_spec=env.spec,
                  num_tasks=10,
                  steps_per_epoch=epoch_cycles,
                  replay_buffer=replay_buffer,
                  min_buffer_size=1500,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=1280)
    if _gpu is not None:
        set_gpu_mode(True, _gpu)
    mtsac.to()
    trainer.setup(algo=mtsac, env=mt10_train_envs)
    trainer.train(n_epochs=epochs, batch_size=batch_size)
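# For intuition, the epoch arithmetic above compresses the requested timestep
# budget into roughly `num_evaluation_points` reported epochs, each covering
# `epoch_cycles` rounds of sampling + gradient updates. The concrete numbers
# below (episode length 150, a 20M-timestep budget) are assumed for
# illustration only and are not taken from the script above.
max_episode_length = 150
meta_batch_size = 10
timesteps = 20_000_000
num_evaluation_points = 500

batch_size = max_episode_length * meta_batch_size    # 1500 env steps per sampling round
raw_epochs = timesteps // batch_size                 # 13333 sampling rounds in total
epoch_cycles = raw_epochs // num_evaluation_points   # 26 rounds per reported epoch
epochs = raw_epochs // epoch_cycles                  # 512 reported epochs (~num_evaluation_points)
print(batch_size, epoch_cycles, epochs)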
def __init__(self, experiment_name, use_gpu, trainer_args):
    """Train MTSAC with the metaworld_experiments environment.

    Args:
        experiment_name: experiment name to be used for logging and
            checkpointing
        use_gpu: boolean, defines whether or not to use the GPU for training
        trainer_args: named tuple with args given by config
    """
    # Define log and checkpoint dir
    self.checkpoint_dir = os.path.join(
        trainer_args.log_dir,
        f"{experiment_name}-{trainer_args.project_id}"
    )
    print(f"Checkpoint dir: {self.checkpoint_dir}")
    self.state_path = os.path.join(self.checkpoint_dir, "experiment_state.p")
    self.env_state_path = os.path.join(self.checkpoint_dir, "env_state.p")
    self.config_path = os.path.join(self.checkpoint_dir, "config.json")
    self.experiment_name = experiment_name

    # Only define viz_save_path if visualizations should be saved locally
    self.viz_save_path = None
    if trainer_args.save_visualizations_local:
        self.viz_save_path = os.path.join(self.checkpoint_dir, "viz")

    # Check if loading from an existing experiment
    self.loading_from_existing = os.path.exists(self.checkpoint_dir)
    os.makedirs(self.checkpoint_dir, exist_ok=True)

    # Save arguments for later retrieval
    self.init_config(trainer_args)

    num_tasks = trainer_args.num_tasks

    # TODO: do we have to fix which GPU to use? run distributed across multiple GPUs
    if use_gpu:
        set_gpu_mode(True, 0)

    if trainer_args.seed is not None:
        deterministic.set_seed(trainer_args.seed)

    # Note: a different benchmark class is used depending on whether 10 or
    # 50 tasks are requested. Why?
    mt_env = (
        metaworld.MT10(seed=trainer_args.env_seed)
        if num_tasks <= 10
        else metaworld.MT50(seed=trainer_args.env_seed)
    )

    train_task_sampler = MetaWorldTaskSampler(
        mt_env, "train", add_env_onehot=True
    )

    # TODO: add some clarifying comments on why these asserts are required
    assert num_tasks % 10 == 0, "Number of tasks must be divisible by 10"
    assert num_tasks <= 500, "Number of tasks must be less than or equal to 500"

    # TODO: do we have guarantees that, when the seed is set, the tasks being
    # sampled are the same?
    mt_train_envs = train_task_sampler.sample(num_tasks)
    env = mt_train_envs[0]()

    if trainer_args.params_seed is not None:
        torch.manual_seed(trainer_args.params_seed)

    policy = create_policy_net(env_spec=env.spec, net_params=trainer_args)
    qf1 = create_qf_net(env_spec=env.spec, net_params=trainer_args)
    qf2 = create_qf_net(env_spec=env.spec, net_params=trainer_args)

    if trainer_args.params_seed is not None:
        calculate_mean_param("policy", policy)
        calculate_mean_param("qf1", qf1)
        calculate_mean_param("qf2", qf2)

    if trainer_args.override_weight_initialization:
        logging.warning("Overriding dendritic layer weight initialization")
        self.override_weight_initialization([policy, qf1, qf2])

    replay_buffer = PathBuffer(
        capacity_in_transitions=trainer_args.num_buffer_transitions
    )
    max_episode_length = env.spec.max_episode_length

    self.env_steps_per_epoch = int(max_episode_length * num_tasks)
    self.num_epochs = trainer_args.timesteps // self.env_steps_per_epoch

    sampler = RaySampler(
        agent=policy,
        envs=mt_train_envs,
        max_episode_length=max_episode_length,
        cpus_per_worker=trainer_args.cpus_per_worker,
        gpus_per_worker=trainer_args.gpus_per_worker,
        workers_per_env=trainer_args.workers_per_env,
        seed=trainer_args.seed,
    )

    self._algo = CustomMTSAC(
        env_spec=env.spec,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        replay_buffer=replay_buffer,
        sampler=sampler,
        train_task_sampler=train_task_sampler,
        gradient_steps_per_itr=int(
            max_episode_length * trainer_args.num_grad_steps_scale
        ),
        task_update_frequency=trainer_args.task_update_frequency,
        num_tasks=num_tasks,
        min_buffer_size=max_episode_length * num_tasks,
        target_update_tau=trainer_args.target_update_tau,
        discount=trainer_args.discount,
        buffer_batch_size=trainer_args.buffer_batch_size,
        policy_lr=trainer_args.policy_lr,
        qf_lr=trainer_args.qf_lr,
        reward_scale=trainer_args.reward_scale,
        num_evaluation_episodes=trainer_args.eval_episodes,
        fp16=trainer_args.fp16 if use_gpu else False,
        log_per_task=trainer_args.log_per_task,
        share_train_eval_env=trainer_args.share_train_eval_env,
    )

    # Override with loaded networks if resuming an existing experiment
    self.current_epoch = 0
    if self.loading_from_existing:
        self.load_experiment_state()

    # Move all networks within the model onto the device
    self._algo.to()
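# The constructor above calls calculate_mean_param to sanity-check that runs
# started with the same params_seed begin from identical weights. The helper
# below is a hypothetical sketch of such a check (the real project helper may
# differ); it only prints a summary statistic of each network's parameters.
import torch


def calculate_mean_param(name, network):
    """Print the mean of all parameter values of ``network`` for comparison."""
    with torch.no_grad():
        flat = torch.cat([p.reshape(-1) for p in network.parameters()])
    print(f"{name}: {flat.numel()} parameters, mean value {flat.mean().item():.6f}")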
def mtsac_metaworld_mt50(
    ctxt=None, *, config_pth, seed, timesteps, use_wandb, wandb_project_name, gpu
):
    """Train MTSAC with the MT50 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        config_pth (str): Path to the json config with experiment parameters.
        seed (int): Used to seed the random number generator to produce
            determinism.
        timesteps (int): Number of timesteps to run.
        use_wandb (str): "True" to log the run to wandb.
        wandb_project_name (str): wandb project to log the run to.
        gpu (int): The ID of the gpu to be used (used on multi-gpu machines).
    """
    print(f"Initiation took {time() - t0:.2f} secs")

    # Get experiment parameters (e.g. hyperparameters) and save the json file
    params = get_params(config_pth)

    with open(ctxt.snapshot_dir + "/params.json", "w") as json_file:
        json.dump(params, json_file)

    if use_wandb == "True":
        use_wandb = True
        wandb.init(
            name=params["experiment_name"],
            project=wandb_project_name,
            group="Baselines{}".format("mt50"),
            reinit=True,
            config=params,
        )
    else:
        use_wandb = False

    num_tasks = 50
    deterministic.set_seed(seed)
    trainer = CustomTrainer(ctxt)
    mt50 = metaworld.MT50()
    train_task_sampler = MetaWorldTaskSampler(mt50, "train", add_env_onehot=True)

    assert num_tasks % 10 == 0, "Number of tasks must be divisible by 10"
    assert num_tasks <= 500, "Number of tasks must be less than or equal to 500"

    mt50_train_envs = train_task_sampler.sample(num_tasks)
    env = mt50_train_envs[0]()

    params["net"]["policy_min_std"] = np.exp(params["net"]["policy_min_log_std"])
    params["net"]["policy_max_std"] = np.exp(params["net"]["policy_max_log_std"])

    policy = create_policy_net(env_spec=env.spec, net_params=params["net"])
    qf1 = create_qf_net(env_spec=env.spec, net_params=params["net"])
    qf2 = create_qf_net(env_spec=env.spec, net_params=params["net"])

    replay_buffer = PathBuffer(
        capacity_in_transitions=int(params["general_setting"]["num_buffer_transitions"])
    )
    max_episode_length = env.spec.max_episode_length
    # Note: are the episode lengths the same across all tasks?
    sampler = RaySampler(
        agents=policy,
        envs=mt50_train_envs,
        max_episode_length=max_episode_length,
        # 1 sampler worker for each environment
        n_workers=num_tasks,
        worker_class=DefaultWorker,
    )
    test_sampler = RaySampler(
        agents=policy,
        envs=mt50_train_envs,
        max_episode_length=max_episode_length,
        # 1 sampler worker for each environment
        n_workers=num_tasks,
        worker_class=EvalWorker,
    )

    # Number of transitions before a set of gradient updates
    steps_between_updates = int(max_episode_length * num_tasks)

    # epoch: 1 cycle of data collection + gradient updates
    epochs = timesteps // steps_between_updates

    mtsac = CustomMTSAC(
        env_spec=env.spec,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        replay_buffer=replay_buffer,
        sampler=sampler,
        train_task_sampler=train_task_sampler,
        test_sampler=test_sampler,
        gradient_steps_per_itr=int(
            max_episode_length * params["training"]["num_grad_steps_scale"]
        ),
        num_tasks=num_tasks,
        min_buffer_size=max_episode_length * num_tasks,
        target_update_tau=params["training"]["target_update_tau"],
        discount=params["general_setting"]["discount"],
        buffer_batch_size=params["training"]["buffer_batch_size"],
        policy_lr=params["training"]["policy_lr"],
        qf_lr=params["training"]["qf_lr"],
        reward_scale=params["training"]["reward_scale"],
        num_evaluation_episodes=params["general_setting"]["eval_episodes"],
        task_update_frequency=params["training"]["task_update_frequency"],
        wandb_logging=use_wandb,
        evaluation_frequency=params["general_setting"]["evaluation_frequency"],
    )

    if gpu is not None:
        set_gpu_mode(True, gpu)
    mtsac.to()
    trainer.setup(algo=mtsac, env=mt50_train_envs)
    trainer.train(n_epochs=epochs, batch_size=steps_between_updates)
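# Hypothetical shape of the config that get_params(config_pth) is expected to
# return for the MT50 script above. The keys mirror the lookups made in that
# function; the values are placeholders, not recommended settings, and
# create_policy_net/create_qf_net may expect additional "net" keys.
example_params = {
    "experiment_name": "mt50_mtsac_baseline",
    "net": {
        "policy_min_log_std": -20.0,
        "policy_max_log_std": 2.0,
    },
    "training": {
        "num_grad_steps_scale": 1.0,
        "target_update_tau": 5e-3,
        "buffer_batch_size": 1280,
        "policy_lr": 3e-4,
        "qf_lr": 3e-4,
        "reward_scale": 1.0,
        "task_update_frequency": 1,
    },
    "general_setting": {
        "num_buffer_transitions": 1e6,
        "discount": 0.99,
        "eval_episodes": 3,
        "evaluation_frequency": 25,
    },
}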
def mtsac_metaworld_mt10(ctxt=None, *, seed, _gpu, n_tasks, timesteps):
    """Train MTSAC with MT10 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        _gpu (int): The ID of the gpu to be used (used on multi-gpu machines).
        n_tasks (int): Number of tasks to use. Should be a multiple of 10.
        timesteps (int): Number of timesteps to run.

    """
    deterministic.set_seed(seed)
    trainer = Trainer(ctxt)
    mt10 = metaworld.MT10()  # pylint: disable=no-member
    mt10_test = metaworld.MT10()  # pylint: disable=no-member

    # pylint: disable=missing-return-doc, missing-return-type-doc
    def wrap(env, _):
        return normalize(env, normalize_reward=True)

    train_task_sampler = MetaWorldTaskSampler(mt10,
                                              'train',
                                              wrap,
                                              add_env_onehot=True)
    test_task_sampler = MetaWorldTaskSampler(mt10_test,
                                             'train',
                                             add_env_onehot=True)
    assert n_tasks % 10 == 0
    assert n_tasks <= 500
    mt10_train_envs = train_task_sampler.sample(n_tasks)
    env = mt10_train_envs[0]()
    mt10_test_envs = [env_up() for env_up in test_task_sampler.sample(n_tasks)]

    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[400, 400, 400],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )
    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)
    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

    meta_batch_size = 10
    # ray.init(local_mode=True, log_to_driver=False, ignore_reinit_error=True)
    sampler = SingleVecWorkSampler(
        agents=policy,
        envs=mt10_train_envs,
        # 1 sampler worker for each environment
        n_workers=meta_batch_size,
        max_episode_length=env.spec.max_episode_length,
        worker_class=FragmentWorker,
        # Increasing n_envs increases the vectorization of a sampler worker,
        # which improves runtime performance, but you will need to adjust it
        # depending on your memory constraints. For reference, each worker by
        # default uses n_envs=8, and each environment is approximately 50 MB,
        # so creating 50 envs with 8 copies comes out to 20 GB of memory.
        # Adjust n_envs (set below) to fit your memory budget if you want to
        # run multiple seeds on one machine.
        worker_args=dict(n_envs=10))

    # one episode for each task between gradient steps
    batch_size = int(env.spec.max_episode_length * meta_batch_size)
    num_evaluation_points = 500
    epochs = timesteps // batch_size
    epoch_cycles = epochs // num_evaluation_points
    epochs = epochs // epoch_cycles
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  sampler=sampler,
                  gradient_steps_per_itr=env.spec.max_episode_length,
                  eval_env=mt10_test_envs,
                  env_spec=env.spec,
                  num_tasks=10,
                  steps_per_epoch=epoch_cycles,
                  replay_buffer=replay_buffer,
                  min_buffer_size=1500,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=1280)
    if _gpu is not None:
        set_gpu_mode(True, _gpu)
    trainer.setup(algo=mtsac, env=mt10_train_envs)

    # Benchmarking/debug section: time moving the networks to the device,
    # collecting samples, and a few gradient updates.
    import time
    s = time.time()
    mtsac.to()
    print(time.time() - s)

    s = time.time()
    # trainer.step_episode = trainer.obtain_samples(0, 1500, None, None)
    trainer.step_episode = trainer.obtain_samples(0, 2000, None, None)
    print(time.time() - s)

    from garage import StepType
    path_returns = []
    for path in trainer.step_episode:
        mtsac.replay_buffer.add_path(
            dict(observation=path['observations'],
                 action=path['actions'],
                 reward=path['rewards'].reshape(-1, 1),
                 next_observation=path['next_observations'],
                 terminal=np.array([
                     step_type == StepType.TERMINAL
                     for step_type in path['step_types']
                 ]).reshape(-1, 1)))
        path_returns.append(sum(path['rewards']))

    s = time.time()
    for _ in range(10):
        trainer._algo.train_once()
    print((time.time() - s) / 10)
def mtsac_metaworld_mt10(ctxt=None, *, experiment_name, config_pth, seed,
                         use_wandb, gpu):
    """Train MTSAC with MT10 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        experiment_name (str): Name used for logging and for the wandb run.
        config_pth (str): Path to the json config with experiment parameters.
        seed (int): Used to seed the random number generator to produce
            determinism.
        use_wandb (str): "True" to log the run to wandb.
        gpu (bool): Whether to use the GPU for training.
    """
    print(f"Initiation took {time() - t0:.2f} secs")

    device = torch.device("cuda") if gpu else torch.device("cpu")
    print(f"Using GPU: {gpu}, Device: {device}")
    # Note: this may be overriding other settings; it is required here, but why?
    if gpu:
        set_gpu_mode(True)
    else:
        set_gpu_mode(False)

    # Get experiment parameters (e.g. hyperparameters) and save the json file
    params = get_params(config_pth)

    with open(ctxt.snapshot_dir + "/params.json", "w") as json_file:
        json.dump(params, json_file)

    if use_wandb == "True":
        use_wandb = True
        wandb.init(
            name=experiment_name,
            project="mt10_debug",
            group="Baselines{}".format("mt10"),
            reinit=True,
            config=params,
        )
    else:
        use_wandb = False

    num_tasks = params["net"]["num_tasks"]
    timesteps = 15000000
    deterministic.set_seed(seed)
    trainer = Trainer(ctxt)

    # Note: a different benchmark class is used depending on whether 10 or
    # 50 tasks are requested. Why?
    if num_tasks <= 10:
        mt_env = metaworld.MT10()
    else:
        mt_env = metaworld.MT50()

    train_task_sampler = MetaWorldTaskSampler(mt_env, "train",
                                              add_env_onehot=True)

    assert num_tasks % 10 == 0, "Number of tasks must be divisible by 10"
    assert num_tasks <= 500, "Number of tasks must be less than or equal to 500"

    mt_train_envs = train_task_sampler.sample(num_tasks)
    env = mt_train_envs[0]()

    params["net"]["policy_min_std"] = np.exp(
        params["net"]["policy_min_log_std"])
    params["net"]["policy_max_std"] = np.exp(
        params["net"]["policy_max_log_std"])

    policy = create_policy_net(env_spec=env.spec, net_params=params["net"])
    print("Created policy")
    qf1 = create_qf_net(env_spec=env.spec, net_params=params["net"])
    qf2 = create_qf_net(env_spec=env.spec, net_params=params["net"])
    print("Created value functions")

    replay_buffer = PathBuffer(capacity_in_transitions=int(
        params["general_setting"]["num_buffer_transitions"]))
    max_episode_length = env.spec.max_episode_length
    # Note: are the episode lengths the same across all tasks?

    sampler = RaySampler(
        agents=policy,
        envs=mt_train_envs,
        max_episode_length=max_episode_length,
        cpus_per_worker=params["sampler"]["cpus_per_worker"],
        gpus_per_worker=params["sampler"]["gpus_per_worker"],
        seed=None,  # set to get_seed() to make it deterministic
    )

    # The evaluation sampler will probably still be needed.
    test_sampler = sampler
    # test_sampler = RaySampler(
    #     agents=policy,
    #     envs=mt_train_envs,
    #     max_episode_length=max_episode_length,
    #     # 1 sampler worker for each environment
    #     n_workers=num_tasks,
    #     worker_class=EvalWorker
    # )
    # Note: the only difference between the sampler and the test sampler is
    # the worker; the difference is one line in EvalWorker, which uses the
    # mean action: a = agent_info["mean"]. Can we create a unified worker
    # that contains both rules?

    # Number of transitions before a set of gradient updates.
    # Note: should we use the average episode length if it is not the same
    # for all tasks?
    batch_size = int(max_episode_length * num_tasks)

    # TODO: this whole block seems unnecessary, it is not doing anything.
    # Number of times the policy is evaluated (also the number of epochs)
    num_evaluation_points = timesteps // batch_size
    epochs = timesteps // batch_size
    # Number of times a new batch of samples + gradient updates is done per epoch
    epoch_cycles = epochs // num_evaluation_points  # this will always be equal to 1
    epochs = epochs // epoch_cycles

    mtsac = CustomMTSAC(
        env_spec=env.spec,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        replay_buffer=replay_buffer,
        sampler=sampler,
        train_task_sampler=train_task_sampler,
        test_sampler=test_sampler,
        gradient_steps_per_itr=1,
        num_tasks=num_tasks,
        steps_per_epoch=epoch_cycles,
        min_buffer_size=max_episode_length * num_tasks,
        target_update_tau=params["training"]["target_update_tau"],
        discount=params["general_setting"]["discount"],
        buffer_batch_size=params["training"]["buffer_batch_size"],
        policy_lr=params["training"]["policy_lr"],
        qf_lr=params["training"]["qf_lr"],
        reward_scale=params["training"]["reward_scale"],
        num_evaluation_episodes=params["general_setting"]["eval_episodes"],
        task_update_frequency=params["training"]["task_update_frequency"],
        wandb_logging=use_wandb,
        evaluation_frequency=params["general_setting"]["evaluation_frequency"],
    )
    print("Created algo")

    mtsac.to(device=device)
    print("Moved networks to device")
    trainer.setup(algo=mtsac, env=mt_train_envs)
    print("Setup trainer")
    trainer.train(n_epochs=epochs, batch_size=batch_size)
def mtsac_metaworld_mt10(ctxt=None, *, seed, _gpu, n_tasks, timesteps):
    """Train MTSAC with MT10 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        _gpu (int): The ID of the gpu to be used (used on multi-gpu machines).
        n_tasks (int): Number of tasks to use. Should be a multiple of 10.
        timesteps (int): Number of timesteps to run.

    """
    deterministic.set_seed(seed)
    trainer = Trainer(ctxt)
    mt10 = metaworld.MT10()
    mt10_test = metaworld.MT10()

    # pylint: disable=missing-return-doc, missing-return-type-doc
    def wrap(env, _):
        return normalize(env)

    train_task_sampler = MetaWorldTaskSampler(mt10,
                                              'train',
                                              wrap,
                                              add_env_onehot=True)
    test_task_sampler = MetaWorldTaskSampler(mt10_test,
                                             'train',
                                             wrap,
                                             add_env_onehot=True)
    assert n_tasks % 10 == 0
    assert n_tasks <= 500
    mt10_train_envs = train_task_sampler.sample(n_tasks)
    env = mt10_train_envs[0]()
    mt10_test_envs = [env_up() for env_up in test_task_sampler.sample(n_tasks)]

    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[400, 400, 400],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )
    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)
    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

    meta_batch_size = 10
    batch_size = int(env.spec.max_episode_length * meta_batch_size)
    num_evaluation_points = 500
    epochs = timesteps // batch_size
    epoch_cycles = epochs // num_evaluation_points
    epochs = epochs // epoch_cycles
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  gradient_steps_per_itr=150,
                  eval_env=mt10_test_envs,
                  env_spec=env.spec,
                  num_tasks=10,
                  steps_per_epoch=epoch_cycles,
                  replay_buffer=replay_buffer,
                  min_buffer_size=1500,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=1280)
    if _gpu is not None:
        set_gpu_mode(True, _gpu)
    mtsac.to()
    trainer.setup(algo=mtsac,
                  env=mt10_train_envs,
                  sampler_cls=LocalSampler,
                  n_workers=meta_batch_size)
    trainer.train(n_epochs=epochs, batch_size=batch_size)