# Imports reconstructed so the file is self-contained. The Custom* classes,
# EvalWorker, the create_*_net builders, and get_params are project-local;
# their module paths below are assumptions.
import json
from time import time

import metaworld
import numpy as np
import torch
import wandb

from garage.envs import normalize
from garage.experiment import MetaWorldTaskSampler, deterministic
from garage.experiment.deterministic import set_seed
from garage.replay_buffer import PathBuffer
from garage.sampler import DefaultWorker, RaySampler
from garage.torch import set_gpu_mode
from garage.trainer import Trainer

from experiment_utils import get_params  # assumed project-local path
from networks import create_policy_net, create_qf_net, create_vf_net  # assumed
from algos import CustomMTPPO, CustomMTSAC  # assumed project-local path
from trainers import CustomTrainer  # assumed project-local path
from workers import EvalWorker  # assumed project-local path

t0 = time()  # module-load timestamp, reported by the functions below


def mtppo_metaworld_mt10(ctxt, experiment_name, config_pth, seed, n_workers,
                         n_tasks, use_wandb, wandb_username, use_gpu):
    """Set up environment and algorithm and run the task.
    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        batch_size (int): Number of environment steps in one batch.
        n_workers (int): The number of workers the sampler should use.
        n_tasks (int): Number of tasks to use. Should be a multiple of 10.
    """
    params = get_params(config_pth)
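    # Illustrative shape of the config JSON, inferred from the keys read in
    # this function (values are placeholders, not from the source):
    #   {"net": {...},
    #    "training": {"policy_lr": ..., "vf_lr": ..., "ppo_eps": ...,
    #                 "minibatch_size": ..., "ppo_epochs": ...,
    #                 "num_train_per_epoch": ..., "gae_lambda": ...,
    #                 "task_update_frequency": ..., "epochs": ...,
    #                 "batch_episodes_per_task": ...},
    #    "general_setting": {"discount": ..., "eval_episodes": ...,
    #                        "eval_freq": ...}}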
    set_seed(seed)
    mt10 = metaworld.MT10()
    train_task_sampler = MetaWorldTaskSampler(mt10,
                                              'train',
                                              lambda env, _: normalize(env),
                                              add_env_onehot=True)

    # CLI layers pass flags through as strings, hence the comparison against
    # 'True' (see the str2bool sketch after this function).
    if use_wandb == 'True':
        use_wandb = True
        wandb.init(
            name=experiment_name,
            entity=wandb_username,
            project="mt10",
            group="Baselinesmt10",
            reinit=True,
            config=params,
        )
    else:
        use_wandb = False

    assert n_tasks % 10 == 0, "n_tasks has to be divisible by 10"
    assert n_tasks <= 500, "n_tasks has to be at most 500"

    envs = [env_up() for env_up in train_task_sampler.sample(n_tasks)]
    env = envs[0]

    policy = create_policy_net(env_spec=env.spec, net_params=params["net"])
    value_function = create_vf_net(env_spec=env.spec, net_params=params["net"])

    sampler = RaySampler(agents=policy,
                         envs=env,
                         max_episode_length=env.spec.max_episode_length,
                         n_workers=n_workers,
                         worker_class=DefaultWorker)

    gpu_training = bool(use_gpu)

    algo = CustomMTPPO(env_spec=env.spec,
                       policy=policy,
                       value_function=value_function,
                       sampler=sampler,
                       train_task_sampler=train_task_sampler,
                       num_tasks=n_tasks,
                       task_update_frequency=params["training"]["task_update_frequency"],
                       num_eval_eps=params["general_setting"]["eval_episodes"],
                       policy_lr=params["training"]["policy_lr"],
                       vf_lr=params["training"]["vf_lr"],
                       ppo_eps=params["training"]["ppo_eps"],
                       minibatch_size=params["training"]["minibatch_size"],
                       ppo_epochs=params["training"]["ppo_epochs"],
                       num_train_per_epoch=params["training"]["num_train_per_epoch"],
                       discount=params["general_setting"]["discount"],
                       gae_lambda=params["training"]["gae_lambda"],
                       center_adv=False,
                       wandb_logging=use_wandb,
                       eval_freq=params["general_setting"]["eval_freq"],
                       stop_entropy_gradient=True,
                       entropy_method='max',
                       gpu_training=gpu_training
                       )

    trainer = Trainer(ctxt)
    trainer.setup(algo, env)
    trainer.train(n_epochs=params["training"]["epochs"],
                  batch_size=params["training"]["batch_episodes_per_task"],
                  plot=False)
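

# Hedged helper sketch (not in the original source): these entry points
# receive `use_wandb` as the string "True"/"False" because CLI layers pass
# flags through as strings. A parser like this would replace the string
# comparisons above and below:
def str2bool(value):
    """Interpret common truthy strings ("True", "1", "yes") as True."""
    return str(value).strip().lower() in ("true", "1", "yes")
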
def mtsac_metaworld_mt50(
    ctxt=None, *, config_pth, seed, timesteps, use_wandb, wandb_project_name, gpu
):
    """Train MTSAC with MT50 environment.
    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        _gpu (int): The ID of the gpu to be used (used on multi-gpu machines).
        num_tasks (int): Number of tasks to use. Should be a multiple of 10.
        timesteps (int): Number of timesteps to run.
    """
    """Train MTSAC with metaworld_experiments environment.
    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        _gpu (int): The ID of the gpu to be used (used on multi-gpu machines).
        timesteps (int): Number of timesteps to run.
    """
    print(f"Initiation took {time() - t0:.2f} secs")

    # Get experiment parameters (e.g. hyperparameters) and save the json file
    params = get_params(config_pth)

    with open(ctxt.snapshot_dir + "/params.json", "w") as json_file:
        json.dump(params, json_file)

    if use_wandb == "True":
        use_wandb = True
        wandb.init(
            name=params["experiment_name"],
            project=wandb_project_name,
            group="Baselines{}".format("mt50"),
            reinit=True,
            config=params,
        )
    else:
        use_wandb = False

    num_tasks = 50
    deterministic.set_seed(seed)
    trainer = CustomTrainer(ctxt)
    mt50 = metaworld.MT50()

    train_task_sampler = MetaWorldTaskSampler(mt50, "train", add_env_onehot=True)

    assert num_tasks % 10 == 0, "Number of tasks has to be divisible by 10"
    assert num_tasks <= 500, "Number of tasks has to be at most 500"
    mt50_train_envs = train_task_sampler.sample(num_tasks)
    env = mt50_train_envs[0]()

    params["net"]["policy_min_std"] = np.exp(params["net"]["policy_min_log_std"])
    params["net"]["policy_max_std"] = np.exp(params["net"]["policy_max_log_std"])

    policy = create_policy_net(env_spec=env.spec, net_params=params["net"])
    qf1 = create_qf_net(env_spec=env.spec, net_params=params["net"])
    qf2 = create_qf_net(env_spec=env.spec, net_params=params["net"])

    replay_buffer = PathBuffer(
        capacity_in_transitions=int(params["general_setting"]["num_buffer_transitions"])
    )
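    # Sizing note: garage's PathBuffer is FIFO over whole paths, evicting the
    # oldest path once capacity_in_transitions is reached, so this capacity
    # bounds memory across all tasks sharing the buffer.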
    max_episode_length = env.spec.max_episode_length
    # Note: is the episode length the same across all tasks?

    sampler = RaySampler(
        agents=policy,
        envs=mt50_train_envs,
        max_episode_length=max_episode_length,
        # 1 sampler worker for each environment
        n_workers=num_tasks,
        worker_class=DefaultWorker
    )

    test_sampler = RaySampler(
        agents=policy,
        envs=mt50_train_envs,
        max_episode_length=max_episode_length,
        # 1 sampler worker for each environment
        n_workers=num_tasks,
        worker_class=EvalWorker
    )

    # Number of transitions before a set of gradient updates
    steps_between_updates = int(max_episode_length * num_tasks)

    # epoch: 1 cycle of data collection + gradient updates
    epochs = timesteps // steps_between_updates
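    # Worked example with illustrative numbers (not from the config):
    # Meta-World episodes last 500 steps, so with num_tasks = 50 one epoch
    # collects 500 * 50 = 25_000 transitions; a budget of
    # timesteps = 100_000_000 then gives 100_000_000 // 25_000 = 4_000 epochs.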

    mtsac = CustomMTSAC(
        env_spec=env.spec,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        replay_buffer=replay_buffer,
        sampler=sampler,
        train_task_sampler=train_task_sampler,
        test_sampler=test_sampler,
        gradient_steps_per_itr=int(max_episode_length * params["training"]["num_grad_steps_scale"]),
        num_tasks=num_tasks,
        min_buffer_size=max_episode_length * num_tasks,
        target_update_tau=params["training"]["target_update_tau"],
        discount=params["general_setting"]["discount"],
        buffer_batch_size=params["training"]["buffer_batch_size"],
        policy_lr=params["training"]["policy_lr"],
        qf_lr=params["training"]["qf_lr"],
        reward_scale=params["training"]["reward_scale"],
        num_evaluation_episodes=params["general_setting"]["eval_episodes"],
        task_update_frequency=params["training"]["task_update_frequency"],
        wandb_logging=use_wandb,
        evaluation_frequency=params["general_setting"]["evaluation_frequency"]
    )

    if gpu is not None:
        set_gpu_mode(True, gpu)

    mtsac.to()
    trainer.setup(algo=mtsac, env=mt50_train_envs)
    trainer.train(n_epochs=epochs, batch_size=steps_between_updates)
def mtsac_metaworld_mt10(ctxt=None,
                         *,
                         experiment_name,
                         config_pth,
                         seed,
                         use_wandb,
                         gpu):
    """Train MTSAC with MT10 environment.
    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        _gpu (int): The ID of the gpu to be used (used on multi-gpu machines).
        timesteps (int): Number of timesteps to run.
    """
    print(f"Initiation took {time()-t0:.2f} secs")

    device = torch.device("cuda") if gpu else torch.device("cpu")
    print(f"Using GPU: {gpu}, Device: {device}")

    # set_gpu_mode sets garage's global device, which garage's torch modules
    # query internally; it is separate from the local `device` above.
    set_gpu_mode(bool(gpu))

    # Get experiment parameters (e.g. hyperparameters) and save the json file
    params = get_params(config_pth)

    with open(ctxt.snapshot_dir + "/params.json", "w") as json_file:
        json.dump(params, json_file)

    if use_wandb == "True":
        use_wandb = True
        wandb.init(
            name=experiment_name,
            project="mt10_debug",
            group="Baselines{}".format("mt10"),
            reinit=True,
            config=params,
        )
    else:
        use_wandb = False

    num_tasks = params["net"]["num_tasks"]
    timesteps = 15000000
    deterministic.set_seed(seed)
    trainer = Trainer(ctxt)

    # Note: a different benchmark class is needed for 10 vs. 50 tasks. Why?
    if num_tasks <= 10:
        mt_env = metaworld.MT10()
    else:
        mt_env = metaworld.MT50()

    train_task_sampler = MetaWorldTaskSampler(mt_env,
                                              "train",
                                              add_env_onehot=True)

    assert num_tasks % 10 == 0, "Number of tasks has to be divisible by 10"
    assert num_tasks <= 500, "Number of tasks has to be at most 500"
    mt_train_envs = train_task_sampler.sample(num_tasks)
    env = mt_train_envs[0]()

    params["net"]["policy_min_std"] = np.exp(
        params["net"]["policy_min_log_std"])
    params["net"]["policy_max_std"] = np.exp(
        params["net"]["policy_max_log_std"])

    policy = create_policy_net(env_spec=env.spec, net_params=params["net"])
    print("Created policy")

    qf1 = create_qf_net(env_spec=env.spec, net_params=params["net"])
    qf2 = create_qf_net(env_spec=env.spec, net_params=params["net"])
    print("Created value functions")

    replay_buffer = PathBuffer(capacity_in_transitions=int(
        params["general_setting"]["num_buffer_transitions"]))
    max_episode_length = env.spec.max_episode_length
    # Note: is the episode length the same across all tasks?

    sampler = RaySampler(
        agents=policy,
        envs=mt_train_envs,
        max_episode_length=max_episode_length,
        cpus_per_worker=params["sampler"]["cpus_per_worker"],
        gpus_per_worker=params["sampler"]["gpus_per_worker"],
        seed=None,  # set to get_seed() to make it deterministic
    )

    # An evaluation sampler will probably still be needed; for now the
    # training sampler is reused.
    test_sampler = sampler
    # test_sampler = RaySampler(
    #     agents=policy,
    #     envs=mt_train_envs,
    #     max_episode_length=max_episode_length,
    #     # 1 sampler worker for each environment
    #     n_workers=num_tasks,
    #     worker_class=EvalWorker
    # )

    # Note: the only difference between the sampler and the test sampler is
    # the worker class; EvalWorker differs by one line, acting on the mean
    # (a = agent_info["mean"]). Can we create a unified worker that contains
    # both rules?

    # Number of transitions collected before a set of gradient updates
    # Note: should we use the average episode length if it differs across
    # tasks?
    batch_size = int(max_episode_length * num_tasks)

    # The policy is evaluated once per epoch: the number of evaluation points
    # equals the number of epochs, so each epoch runs exactly one
    # sampling + gradient-update cycle.
    epochs = timesteps // batch_size
    epoch_cycles = 1

    mtsac = CustomMTSAC(
        env_spec=env.spec,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        replay_buffer=replay_buffer,
        sampler=sampler,
        train_task_sampler=train_task_sampler,
        test_sampler=test_sampler,
        gradient_steps_per_itr=1,
        num_tasks=num_tasks,
        steps_per_epoch=epoch_cycles,
        min_buffer_size=max_episode_length * num_tasks,
        target_update_tau=params["training"]["target_update_tau"],
        discount=params["general_setting"]["discount"],
        buffer_batch_size=params["training"]["buffer_batch_size"],
        policy_lr=params["training"]["policy_lr"],
        qf_lr=params["training"]["qf_lr"],
        reward_scale=params["training"]["reward_scale"],
        num_evaluation_episodes=params["general_setting"]["eval_episodes"],
        task_update_frequency=params["training"]["task_update_frequency"],
        wandb_logging=use_wandb,
        evaluation_frequency=params["general_setting"]["evaluation_frequency"],
    )

    print("Created algo")

    mtsac.to(device=device)
    print("Moved networks to device")

    trainer.setup(algo=mtsac, env=mt_train_envs)
    print("Setup trainer")

    trainer.train(n_epochs=epochs, batch_size=batch_size)
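

# Minimal launch sketch (hypothetical entry point, not in the source):
# functions whose first parameter is `ctxt` follow garage's wrap_experiment
# convention, which injects the ExperimentContext when the wrapped function
# is called. The argument values below are placeholders.
from garage import wrap_experiment

if __name__ == "__main__":
    run = wrap_experiment(mtsac_metaworld_mt10, snapshot_mode="none")
    run(experiment_name="mtsac_mt10_seed1",
        config_pth="configs/mtsac_mt10.json",  # assumed config location
        seed=1,
        use_wandb="False",
        gpu=False)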