def __init__(self, experiment_name, use_gpu, trainer_args):
    """Set up MTSAC training on the metaworld_experiments environments.

    Args:
        experiment_name: experiment name to be used for logging and
            checkpointing
        use_gpu: boolean, defines whether or not to use the GPU for training
        trainer_args: named tuple with args given by config
    """
    # Define log and checkpoint dir
    self.checkpoint_dir = os.path.join(
        trainer_args.log_dir,
        f"{experiment_name}-{trainer_args.project_id}"
    )
    print(f"Checkpoint dir: {self.checkpoint_dir}")
    self.state_path = os.path.join(self.checkpoint_dir, "experiment_state.p")
    self.env_state_path = os.path.join(self.checkpoint_dir, "env_state.p")
    self.config_path = os.path.join(self.checkpoint_dir, "config.json")
    self.experiment_name = experiment_name

    # Only define viz_save_path if required to save visualizations locally
    self.viz_save_path = None
    if trainer_args.save_visualizations_local:
        self.viz_save_path = os.path.join(self.checkpoint_dir, "viz")

    # Check if loading from existing experiment
    self.loading_from_existing = os.path.exists(self.checkpoint_dir)
    os.makedirs(self.checkpoint_dir, exist_ok=True)

    # Save arguments for later retrieval
    self.init_config(trainer_args)

    num_tasks = trainer_args.num_tasks

    # TODO: do we have to fix which GPU to use? run distributed across multiple GPUs
    if use_gpu:
        set_gpu_mode(True, 0)

    if trainer_args.seed is not None:
        deterministic.set_seed(trainer_args.seed)

    # Note: different classes depending on whether it uses 10 or 50 tasks. Why?
    mt_env = (
        metaworld.MT10(seed=trainer_args.env_seed) if num_tasks <= 10
        else metaworld.MT50(seed=trainer_args.env_seed)
    )

    train_task_sampler = MetaWorldTaskSampler(
        mt_env, "train", add_env_onehot=True
    )

    # TODO: add some clarifying comments on why these asserts are required
    assert num_tasks % 10 == 0, "Number of tasks has to be divisible by 10"
    assert num_tasks <= 500, "Number of tasks should be less than or equal to 500"

    # TODO: do we have guarantees that in case seed is set, the tasks being
    # sampled are the same?
    mt_train_envs = train_task_sampler.sample(num_tasks)
    env = mt_train_envs[0]()

    if trainer_args.params_seed is not None:
        torch.manual_seed(trainer_args.params_seed)

    policy = create_policy_net(env_spec=env.spec, net_params=trainer_args)
    qf1 = create_qf_net(env_spec=env.spec, net_params=trainer_args)
    qf2 = create_qf_net(env_spec=env.spec, net_params=trainer_args)

    if trainer_args.params_seed is not None:
        calculate_mean_param("policy", policy)
        calculate_mean_param("qf1", qf1)
        calculate_mean_param("qf2", qf2)

    if trainer_args.override_weight_initialization:
        logging.warning("Overriding dendritic layer weight initialization")
        self.override_weight_initialization([policy, qf1, qf2])

    replay_buffer = PathBuffer(
        capacity_in_transitions=trainer_args.num_buffer_transitions
    )
    max_episode_length = env.spec.max_episode_length

    self.env_steps_per_epoch = int(max_episode_length * num_tasks)
    self.num_epochs = trainer_args.timesteps // self.env_steps_per_epoch

    sampler = RaySampler(
        agent=policy,
        envs=mt_train_envs,
        max_episode_length=max_episode_length,
        cpus_per_worker=trainer_args.cpus_per_worker,
        gpus_per_worker=trainer_args.gpus_per_worker,
        workers_per_env=trainer_args.workers_per_env,
        seed=trainer_args.seed,
    )

    self._algo = CustomMTSAC(
        env_spec=env.spec,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        replay_buffer=replay_buffer,
        sampler=sampler,
        train_task_sampler=train_task_sampler,
        gradient_steps_per_itr=int(
            max_episode_length * trainer_args.num_grad_steps_scale
        ),
        task_update_frequency=trainer_args.task_update_frequency,
        num_tasks=num_tasks,
        min_buffer_size=max_episode_length * num_tasks,
        target_update_tau=trainer_args.target_update_tau,
        discount=trainer_args.discount,
        buffer_batch_size=trainer_args.buffer_batch_size,
        policy_lr=trainer_args.policy_lr,
        qf_lr=trainer_args.qf_lr,
        reward_scale=trainer_args.reward_scale,
        num_evaluation_episodes=trainer_args.eval_episodes,
        fp16=trainer_args.fp16 if use_gpu else False,
        log_per_task=trainer_args.log_per_task,
        share_train_eval_env=trainer_args.share_train_eval_env,
    )

    # Override with loaded networks if resuming an existing experiment
    self.current_epoch = 0
    if self.loading_from_existing:
        self.load_experiment_state()

    # Move all networks within the model to the device
    self._algo.to()
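
# `calculate_mean_param` (used above to sanity-check initialization when
# `params_seed` is set) is defined elsewhere in the repo. The sketch below is an
# assumption of what such a helper could look like, not the actual
# implementation: it logs a summary statistic of the freshly initialized
# weights so two runs with the same `params_seed` can be compared.
def calculate_mean_param(name, network):
    """Print the mean absolute value of all parameters of `network` (hypothetical sketch)."""
    with torch.no_grad():
        flat = torch.cat([p.abs().flatten() for p in network.parameters()])
    print(f"{name}: mean |param| = {flat.mean().item():.6f} over {flat.numel()} parameters")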
def mtppo_metaworld_mt10(ctxt, experiment_name, config_pth, seed, n_workers,
                         n_tasks, use_wandb, wandb_username, use_gpu):
    """Set up the environment and algorithm and run the task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        experiment_name (str): Name used for wandb logging.
        config_pth (str): Path to the config file with the hyperparameters.
        seed (int): Used to seed the random number generator to produce
            determinism.
        n_workers (int): The number of workers the sampler should use.
        n_tasks (int): Number of tasks to use. Should be a multiple of 10.
        use_wandb (str): "True" to log to wandb, anything else to disable it.
        wandb_username (str): wandb entity to log under.
        use_gpu (bool): Whether to train on the GPU.
    """
    params = get_params(config_pth)
    set_seed(seed)

    mt10 = metaworld.MT10()
    train_task_sampler = MetaWorldTaskSampler(
        mt10, "train", lambda env, _: normalize(env), add_env_onehot=True
    )

    if use_wandb == "True":
        use_wandb = True
        wandb.init(
            name=experiment_name,
            entity=wandb_username,
            project="mt10",
            group="Baselines{}".format("mt10"),
            reinit=True,
            config=params,
        )
    else:
        use_wandb = False

    assert n_tasks % 10 == 0, "Number of tasks has to be divisible by 10"
    assert n_tasks <= 500, "Number of tasks should be less than or equal to 500"
    envs = [env_up() for env_up in train_task_sampler.sample(n_tasks)]
    env = envs[0]

    policy = create_policy_net(env_spec=env.spec, net_params=params["net"])
    value_function = create_vf_net(env_spec=env.spec, net_params=params["net"])

    sampler = RaySampler(
        agents=policy,
        envs=env,
        max_episode_length=env.spec.max_episode_length,
        n_workers=n_workers,
        worker_class=DefaultWorker,
    )

    gpu_training = bool(use_gpu)

    algo = CustomMTPPO(
        env_spec=env.spec,
        policy=policy,
        value_function=value_function,
        sampler=sampler,
        train_task_sampler=train_task_sampler,
        num_tasks=n_tasks,
        task_update_frequency=params["training"]["task_update_frequency"],
        num_eval_eps=params["general_setting"]["eval_episodes"],
        policy_lr=params["training"]["policy_lr"],
        vf_lr=params["training"]["vf_lr"],
        ppo_eps=params["training"]["ppo_eps"],
        minibatch_size=params["training"]["minibatch_size"],
        ppo_epochs=params["training"]["ppo_epochs"],
        num_train_per_epoch=params["training"]["num_train_per_epoch"],
        discount=params["general_setting"]["discount"],
        gae_lambda=params["training"]["gae_lambda"],
        center_adv=False,
        wandb_logging=use_wandb,
        eval_freq=params["general_setting"]["eval_freq"],
        stop_entropy_gradient=True,
        entropy_method="max",
        gpu_training=gpu_training,
    )

    trainer = Trainer(ctxt)
    trainer.setup(algo, env)
    trainer.train(
        n_epochs=params["training"]["epochs"],
        batch_size=params["training"]["batch_episodes_per_task"],
        plot=False,
    )
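
# `get_params` is imported from elsewhere in the repo. Judging from how `params`
# is indexed in the functions here ("net", "training", "general_setting"), it
# presumably loads the config file at `config_pth` into a dict. A minimal sketch
# under the assumption that the config is stored as JSON (the repo's actual
# loader may differ, e.g. it could read YAML instead):
def get_params(config_pth):
    """Load experiment hyperparameters from a config file into a dict."""
    import json
    with open(config_pth, "r") as f:
        return json.load(f)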
def mtsac_metaworld_mt50(
    ctxt=None, *, config_pth, seed, timesteps, use_wandb, wandb_project_name, gpu
):
    """Train MTSAC with the MT50 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        config_pth (str): Path to the config file with the hyperparameters.
        seed (int): Used to seed the random number generator to produce
            determinism.
        timesteps (int): Number of timesteps to run.
        use_wandb (str): "True" to log to wandb, anything else to disable it.
        wandb_project_name (str): wandb project to log to.
        gpu (int): The ID of the gpu to be used (used on multi-gpu machines).
    """
    print(f"Initiation took {time() - t0:.2f} secs")

    # Get experiment parameters (e.g. hyperparameters) and save the json file
    params = get_params(config_pth)

    with open(ctxt.snapshot_dir + "/params.json", "w") as json_file:
        json.dump(params, json_file)

    if use_wandb == "True":
        use_wandb = True
        wandb.init(
            name=params["experiment_name"],
            project=wandb_project_name,
            group="Baselines{}".format("mt50"),
            reinit=True,
            config=params,
        )
    else:
        use_wandb = False

    num_tasks = 50
    deterministic.set_seed(seed)
    trainer = CustomTrainer(ctxt)
    mt50 = metaworld.MT50()
    train_task_sampler = MetaWorldTaskSampler(mt50, "train", add_env_onehot=True)

    assert num_tasks % 10 == 0, "Number of tasks has to be divisible by 10"
    assert num_tasks <= 500, "Number of tasks should be less than or equal to 500"
    mt50_train_envs = train_task_sampler.sample(num_tasks)
    env = mt50_train_envs[0]()

    params["net"]["policy_min_std"] = np.exp(params["net"]["policy_min_log_std"])
    params["net"]["policy_max_std"] = np.exp(params["net"]["policy_max_log_std"])

    policy = create_policy_net(env_spec=env.spec, net_params=params["net"])
    qf1 = create_qf_net(env_spec=env.spec, net_params=params["net"])
    qf2 = create_qf_net(env_spec=env.spec, net_params=params["net"])

    replay_buffer = PathBuffer(
        capacity_in_transitions=int(params["general_setting"]["num_buffer_transitions"])
    )
    max_episode_length = env.spec.max_episode_length
    # Note: is the episode length the same among all tasks?
    sampler = RaySampler(
        agents=policy,
        envs=mt50_train_envs,
        max_episode_length=max_episode_length,
        # 1 sampler worker for each environment
        n_workers=num_tasks,
        worker_class=DefaultWorker,
    )

    test_sampler = RaySampler(
        agents=policy,
        envs=mt50_train_envs,
        max_episode_length=max_episode_length,
        # 1 sampler worker for each environment
        n_workers=num_tasks,
        worker_class=EvalWorker,
    )

    # Number of transitions before a set of gradient updates
    steps_between_updates = int(max_episode_length * num_tasks)

    # epoch: 1 cycle of data collection + gradient updates
    epochs = timesteps // steps_between_updates

    mtsac = CustomMTSAC(
        env_spec=env.spec,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        replay_buffer=replay_buffer,
        sampler=sampler,
        train_task_sampler=train_task_sampler,
        test_sampler=test_sampler,
        gradient_steps_per_itr=int(
            max_episode_length * params["training"]["num_grad_steps_scale"]
        ),
        num_tasks=num_tasks,
        min_buffer_size=max_episode_length * num_tasks,
        target_update_tau=params["training"]["target_update_tau"],
        discount=params["general_setting"]["discount"],
        buffer_batch_size=params["training"]["buffer_batch_size"],
        policy_lr=params["training"]["policy_lr"],
        qf_lr=params["training"]["qf_lr"],
        reward_scale=params["training"]["reward_scale"],
        num_evaluation_episodes=params["general_setting"]["eval_episodes"],
        task_update_frequency=params["training"]["task_update_frequency"],
        wandb_logging=use_wandb,
        evaluation_frequency=params["general_setting"]["evaluation_frequency"],
    )

    if gpu is not None:
        set_gpu_mode(True, gpu)
    mtsac.to()

    trainer.setup(algo=mtsac, env=mt50_train_envs)
    trainer.train(n_epochs=epochs, batch_size=steps_between_updates)
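
# Worked example of the epoch arithmetic above (illustrative numbers, not taken
# from any config): MetaWorld episodes are 500 steps long, so with 50 tasks one
# epoch collects 500 * 50 = 25,000 environment steps, and a 100M-timestep run
# performs 100_000_000 // 25_000 = 4,000 collection + update cycles.
def estimate_num_epochs(timesteps, max_episode_length=500, num_tasks=50):
    """Mirror the computation above: one episode per task per epoch (hypothetical helper)."""
    steps_between_updates = int(max_episode_length * num_tasks)
    return timesteps // steps_between_updates


assert estimate_num_epochs(100_000_000) == 4000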
def mtsac_metaworld_mt10(
    ctxt=None, *, experiment_name, config_pth, seed, use_wandb, gpu
):
    """Train MTSAC with the MT10 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        experiment_name (str): Name used for wandb logging.
        config_pth (str): Path to the config file with the hyperparameters.
        seed (int): Used to seed the random number generator to produce
            determinism.
        use_wandb (str): "True" to log to wandb, anything else to disable it.
        gpu (int): The ID of the gpu to be used (used on multi-gpu machines).
    """
    print(f"Initiation took {time() - t0:.2f} secs")

    device = torch.device("cuda") if gpu else torch.device("cpu")
    print(f"Using GPU: {gpu}, Device: {device}")
    # maybe overriding other things - this is required, but why?
    if gpu:
        set_gpu_mode(True)
    else:
        set_gpu_mode(False)

    # Get experiment parameters (e.g. hyperparameters) and save the json file
    params = get_params(config_pth)

    with open(ctxt.snapshot_dir + "/params.json", "w") as json_file:
        json.dump(params, json_file)

    if use_wandb == "True":
        use_wandb = True
        wandb.init(
            name=experiment_name,
            project="mt10_debug",
            group="Baselines{}".format("mt10"),
            reinit=True,
            config=params,
        )
    else:
        use_wandb = False

    num_tasks = params["net"]["num_tasks"]
    timesteps = 15000000
    deterministic.set_seed(seed)
    trainer = Trainer(ctxt)

    # Note: different classes depending on whether it uses 10 or 50 tasks. Why?
    if num_tasks <= 10:
        mt_env = metaworld.MT10()
    else:
        mt_env = metaworld.MT50()

    train_task_sampler = MetaWorldTaskSampler(mt_env, "train", add_env_onehot=True)

    assert num_tasks % 10 == 0, "Number of tasks has to be divisible by 10"
    assert num_tasks <= 500, "Number of tasks should be less than or equal to 500"
    mt_train_envs = train_task_sampler.sample(num_tasks)
    env = mt_train_envs[0]()

    params["net"]["policy_min_std"] = np.exp(params["net"]["policy_min_log_std"])
    params["net"]["policy_max_std"] = np.exp(params["net"]["policy_max_log_std"])

    policy = create_policy_net(env_spec=env.spec, net_params=params["net"])
    print("Created policy")
    qf1 = create_qf_net(env_spec=env.spec, net_params=params["net"])
    qf2 = create_qf_net(env_spec=env.spec, net_params=params["net"])
    print("Created value functions")

    replay_buffer = PathBuffer(
        capacity_in_transitions=int(params["general_setting"]["num_buffer_transitions"])
    )
    max_episode_length = env.spec.max_episode_length
    # Note: is the episode length the same among all tasks?

    sampler = RaySampler(
        agents=policy,
        envs=mt_train_envs,
        max_episode_length=max_episode_length,
        cpus_per_worker=params["sampler"]["cpus_per_worker"],
        gpus_per_worker=params["sampler"]["gpus_per_worker"],
        seed=None,  # set to get_seed() to make it deterministic
    )

    # will probably still need the sampler
    test_sampler = sampler
    # test_sampler = RaySampler(
    #     agents=policy,
    #     envs=mt_train_envs,
    #     max_episode_length=max_episode_length,
    #     # 1 sampler worker for each environment
    #     n_workers=num_tasks,
    #     worker_class=EvalWorker
    # )

    # Note: the only difference between sampler and test_sampler is the worker.
    # The difference is one line in EvalWorker, which uses the average action:
    # a = agent_info["mean"]. Can we create a unified worker that contains both
    # rules? (A possible unification is sketched after this function.)

    # Number of transitions before a set of gradient updates
    # Note: should we use the avg episode length, if it is not the same for all tasks?
    batch_size = int(max_episode_length * num_tasks)

    # TODO: this whole block seems unnecessary, as it is not doing anything.
    # Number of times the policy is evaluated (also the number of epochs)
    num_evaluation_points = timesteps // batch_size
    epochs = timesteps // batch_size

    # number of times a new batch of samples + gradient updates is done per epoch
    epoch_cycles = epochs // num_evaluation_points  # this will always be equal to 1
    epochs = epochs // epoch_cycles

    mtsac = CustomMTSAC(
        env_spec=env.spec,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        replay_buffer=replay_buffer,
        sampler=sampler,
        train_task_sampler=train_task_sampler,
        test_sampler=test_sampler,
        gradient_steps_per_itr=1,
        num_tasks=num_tasks,
        steps_per_epoch=epoch_cycles,
        min_buffer_size=max_episode_length * num_tasks,
        target_update_tau=params["training"]["target_update_tau"],
        discount=params["general_setting"]["discount"],
        buffer_batch_size=params["training"]["buffer_batch_size"],
        policy_lr=params["training"]["policy_lr"],
        qf_lr=params["training"]["qf_lr"],
        reward_scale=params["training"]["reward_scale"],
        num_evaluation_episodes=params["general_setting"]["eval_episodes"],
        task_update_frequency=params["training"]["task_update_frequency"],
        wandb_logging=use_wandb,
        evaluation_frequency=params["general_setting"]["evaluation_frequency"],
    )
    print("Created algo")

    mtsac.to(device=device)
    print("Moved networks to device")

    trainer.setup(algo=mtsac, env=mt_train_envs)
    print("Setup trainer")

    trainer.train(n_epochs=epochs, batch_size=batch_size)
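
# The note in mtsac_metaworld_mt10 above observes that EvalWorker differs from
# DefaultWorker by a single line (acting with agent_info["mean"]). One possible
# unified worker is sketched below. This is an assumption-laden sketch, not code
# from this repo: the step_episode body mirrors garage's DefaultWorker.step_episode
# at the time of writing and may need adjusting to the installed garage version.
from garage.sampler import DefaultWorker


class UnifiedWorker(DefaultWorker):
    """Sample stochastically by default; act with the mean action when
    constructed with use_mean_action=True (evaluation behavior)."""

    def __init__(self, *, seed, max_episode_length, worker_number,
                 use_mean_action=False):
        super().__init__(seed=seed, max_episode_length=max_episode_length,
                         worker_number=worker_number)
        self._use_mean_action = use_mean_action

    def step_episode(self):
        """Take one env step; return True when the episode is finished."""
        if self._eps_length < self._max_episode_length:
            action, agent_info = self.agent.get_action(self._prev_obs)
            if self._use_mean_action:
                # The one line that distinguishes evaluation from training.
                action = agent_info["mean"]
            es = self.env.step(action)
            self._observations.append(self._prev_obs)
            self._env_steps.append(es)
            for k, v in agent_info.items():
                self._agent_infos[k].append(v)
            self._eps_length += 1
            if not es.last:
                self._prev_obs = es.observation
                return False
        self._lengths.append(self._eps_length)
        self._last_observations.append(self._prev_obs)
        return True


# Usage sketch: pass worker_class=UnifiedWorker to both samplers, with
# worker_args=dict(use_mean_action=True) for the evaluation sampler only.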