def train_sac(experiment_config):
    seed = experiment_config[c.SEED]
    save_path = experiment_config.get(c.SAVE_PATH, None)
    buffer_preprocessing = experiment_config.get(c.BUFFER_PREPROCESSING, Identity())

    set_seed(seed)
    train_env = make_env(experiment_config[c.ENV_SETTING], seed)
    model = make_model(experiment_config[c.MODEL_SETTING])
    buffer = make_buffer(experiment_config[c.BUFFER_SETTING],
                         seed,
                         experiment_config[c.BUFFER_SETTING].get(c.LOAD_BUFFER, False))

    policy_opt = make_optimizer(model.policy_parameters,
                                experiment_config[c.OPTIMIZER_SETTING][c.POLICY])
    qs_opt = make_optimizer(model.qs_parameters,
                            experiment_config[c.OPTIMIZER_SETTING][c.QS])
    alpha_opt = make_optimizer([model.log_alpha],
                               experiment_config[c.OPTIMIZER_SETTING][c.ALPHA])

    aux_tasks = make_auxiliary_tasks(experiment_config[c.AUXILIARY_TASKS],
                                     model,
                                     buffer,
                                     experiment_config)

    learning_algorithm = SAC(model=model,
                             policy_opt=policy_opt,
                             qs_opt=qs_opt,
                             alpha_opt=alpha_opt,
                             learn_alpha=experiment_config[c.LEARN_ALPHA],
                             buffer=buffer,
                             algo_params=experiment_config,
                             aux_tasks=aux_tasks)

    load_model = experiment_config.get(c.LOAD_MODEL, False)
    if load_model:
        learning_algorithm.load_state_dict(torch.load(load_model))

    agent = ACAgent(model=model,
                    learning_algorithm=learning_algorithm,
                    preprocess=experiment_config[c.EVALUATION_PREPROCESSING])

    evaluation_env = None
    evaluation_agent = None
    if experiment_config.get(c.EVALUATION_FREQUENCY, 0):
        evaluation_env = make_env(experiment_config[c.ENV_SETTING], seed + 1)
        evaluation_agent = ACAgent(model=model,
                                   learning_algorithm=None,
                                   preprocess=experiment_config[c.EVALUATION_PREPROCESSING])

    summary_writer, save_path = make_summary_writer(save_path=save_path,
                                                    algo=c.SAC,
                                                    cfg=experiment_config)

    train(agent=agent,
          evaluation_agent=evaluation_agent,
          train_env=train_env,
          evaluation_env=evaluation_env,
          buffer_preprocess=buffer_preprocessing,
          experiment_settings=experiment_config,
          summary_writer=summary_writer,
          save_path=save_path)
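# A minimal sketch (not taken from this repo's configs) of the kind of
# experiment_config dict that train_sac above consumes. Only the keys that
# train_sac actually reads are listed; the nested *_SETTING values and the
# preprocessing objects are placeholders and would have to match whatever
# make_env/make_model/make_buffer/make_optimizer expect in this codebase.
#
# example_sac_config = {
#     c.SEED: 42,
#     c.SAVE_PATH: None,                      # optional; defaults to None
#     c.BUFFER_PREPROCESSING: Identity(),     # optional; defaults to Identity()
#     c.ENV_SETTING: {...},                   # consumed by make_env
#     c.MODEL_SETTING: {...},                 # consumed by make_model
#     c.BUFFER_SETTING: {...},                # consumed by make_buffer; may contain c.LOAD_BUFFER
#     c.OPTIMIZER_SETTING: {c.POLICY: {...}, c.QS: {...}, c.ALPHA: {...}},
#     c.LEARN_ALPHA: True,
#     c.AUXILIARY_TASKS: None,                # or a dict handled by make_auxiliary_tasks
#     c.LOAD_MODEL: False,                    # or a checkpoint path passed to torch.load
#     c.EVALUATION_PREPROCESSING: Identity(),
#     c.EVALUATION_FREQUENCY: 0,              # > 0 builds the evaluation env and agent
# }
# train_sac(example_sac_config)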
def make_auxiliary_tasks(tasks, model, buffer, cfg):
    aux_tasks = dict()
    if tasks is not None:
        for task_name, task_setting in tasks.items():
            assert task_name not in aux_tasks
            if task_name == c.KOOPMAN:
                task_setting[c.MODEL_SETTING][c.KWARGS][c.LAYERS_DIM] = model.encoder.layers_dim
                decoder = make_model(task_setting[c.MODEL_SETTING]).to(task_setting[c.DEVICE])
                dynamics = KoopmanDynamics(z_dim=task_setting[c.Z_DIM],
                                           u_dim=task_setting[c.U_DIM],
                                           device=task_setting[c.DEVICE])
                aux_opt = make_optimizer(list(decoder.parameters()) + list(dynamics.parameters()),
                                         task_setting[c.OPTIMIZER_SETTING])
                aux_tasks[c.KOOPMAN] = Koopman(rec_dim=task_setting[c.REC_DIM],
                                               batch_size=task_setting[c.BATCH_SIZE],
                                               decoder=decoder,
                                               encoder=model.encoder,
                                               dynamics=dynamics,
                                               opt=aux_opt,
                                               buffer=buffer,
                                               algo_params=cfg,
                                               reduction=task_setting[c.REDUCTION],
                                               loss_coef=task_setting[c.LOSS_COEF],
                                               device=task_setting[c.DEVICE])
            else:
                raise NotImplementedError

    return AuxiliaryTasks(aux_tasks)
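# A hypothetical sketch of the tasks dict accepted by make_auxiliary_tasks
# above. Only the c.KOOPMAN task is implemented; the per-task keys below are
# exactly the ones read from task_setting, while the concrete values are
# illustrative placeholders rather than settings taken from this repo.
#
# auxiliary_tasks = {
#     c.KOOPMAN: {
#         c.MODEL_SETTING: {c.KWARGS: {}},  # decoder config; c.LAYERS_DIM is copied from model.encoder
#         c.OPTIMIZER_SETTING: {...},       # optimizer for decoder + dynamics parameters
#         c.Z_DIM: 64,                      # latent (Koopman) state dimension
#         c.U_DIM: 6,                       # control/action dimension
#         c.REC_DIM: 10,                    # reconstruction dimension/horizon
#         c.BATCH_SIZE: 128,
#         c.REDUCTION: "mean",
#         c.LOSS_COEF: 1.0,
#         c.DEVICE: "cuda",
#     },
# }
# aux_tasks = make_auxiliary_tasks(auxiliary_tasks, model, buffer, experiment_config)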
def train_bc(experiment_config):
    seed = experiment_config[c.SEED]
    save_path = experiment_config.get(c.SAVE_PATH, None)
    buffer_preprocessing = experiment_config.get(c.BUFFER_PREPROCESSING, Identity())

    set_seed(seed)
    train_env = FakeEnv(obs_dim=experiment_config[c.OBS_DIM])
    model = make_model(experiment_config[c.MODEL_SETTING])
    expert_buffer = make_buffer(experiment_config[c.BUFFER_SETTING],
                                seed,
                                experiment_config[c.BUFFER_SETTING].get(c.LOAD_BUFFER, False))

    optimizer = make_optimizer(model.parameters(),
                               experiment_config[c.OPTIMIZER_SETTING][c.POLICY])

    aux_tasks = make_auxiliary_tasks(experiment_config[c.AUXILIARY_TASKS],
                                     model,
                                     expert_buffer,
                                     experiment_config)

    learning_algorithm = BC(model=model,
                            optimizer=optimizer,
                            expert_buffer=expert_buffer,
                            algo_params=experiment_config,
                            aux_tasks=aux_tasks)

    load_model = experiment_config.get(c.LOAD_MODEL, False)
    if load_model:
        learning_algorithm.load_state_dict(torch.load(load_model))

    agent = ACAgent(model=model,
                    learning_algorithm=learning_algorithm,
                    preprocess=experiment_config[c.EVALUATION_PREPROCESSING])

    evaluation_env = None
    evaluation_agent = None
    if experiment_config.get(c.EVALUATION_FREQUENCY, 0):
        evaluation_env = make_env(experiment_config[c.ENV_SETTING], seed + 1)
        evaluation_agent = ACAgent(model=model,
                                   learning_algorithm=None,
                                   preprocess=experiment_config[c.EVALUATION_PREPROCESSING])

    summary_writer, save_path = make_summary_writer(save_path=save_path,
                                                    algo=c.BC,
                                                    cfg=experiment_config)

    train(agent=agent,
          evaluation_agent=evaluation_agent,
          train_env=train_env,
          evaluation_env=evaluation_env,
          buffer_preprocess=buffer_preprocessing,
          experiment_settings=experiment_config,
          summary_writer=summary_writer,
          save_path=save_path)
def train_sac_diayn(experiment_config):
    seed = experiment_config[c.SEED]
    save_path = experiment_config.get(c.SAVE_PATH, None)
    buffer_preprocessing = experiment_config.get(c.BUFFER_PREPROCESSING, Identity())

    set_seed(seed)
    train_env = make_env(experiment_config[c.ENV_SETTING], seed)
    model = make_model(experiment_config[c.MODEL_SETTING])
    discriminator = make_model(experiment_config[c.DISCRIMINATOR_SETTING])
    prior = experiment_config[c.PRIOR]
    buffer = make_buffer(experiment_config[c.BUFFER_SETTING],
                         seed,
                         experiment_config[c.BUFFER_SETTING].get(c.LOAD_BUFFER, False))

    policy_opt = make_optimizer(model.policy_parameters,
                                experiment_config[c.OPTIMIZER_SETTING][c.POLICY])
    qs_opt = make_optimizer(model.qs_parameters,
                            experiment_config[c.OPTIMIZER_SETTING][c.QS])
    alpha_opt = make_optimizer([model.log_alpha],
                               experiment_config[c.OPTIMIZER_SETTING][c.ALPHA])
    discriminator_opt = make_optimizer(discriminator.parameters(),
                                       experiment_config[c.OPTIMIZER_SETTING][c.DISCRIMINATOR])

    aux_tasks = make_auxiliary_tasks(experiment_config[c.AUXILIARY_TASKS],
                                     model,
                                     buffer,
                                     experiment_config)

    learning_algorithm = SACDIAYN(model=model,
                                  policy_opt=policy_opt,
                                  qs_opt=qs_opt,
                                  alpha_opt=alpha_opt,
                                  learn_alpha=experiment_config[c.LEARN_ALPHA],
                                  buffer=buffer,
                                  algo_params=experiment_config,
                                  aux_tasks=aux_tasks)
    diayn = DIAYN(discriminator=discriminator,
                  prior=prior,
                  discriminator_opt=discriminator_opt,
                  learning_algorithm=learning_algorithm,
                  algo_params=experiment_config)

    load_model = experiment_config.get(c.LOAD_MODEL, False)
    if load_model:
        learning_algorithm.load_state_dict(torch.load(load_model))

    agent = DIAYNAgent(prior=prior,
                       model=model,
                       learning_algorithm=diayn,
                       preprocess=experiment_config[c.EVALUATION_PREPROCESSING])

    evaluation_env = None
    evaluation_agent = None
    if experiment_config.get(c.EVALUATION_FREQUENCY, 0):
        evaluation_env = make_env(experiment_config[c.ENV_SETTING], seed + 1)
        evaluation_agent = DIAYNAgent(prior=prior,
                                      model=model,
                                      learning_algorithm=None,
                                      preprocess=experiment_config[c.EVALUATION_PREPROCESSING])

    class GetTask:
        def __init__(self, agent):
            self.agent = agent

        def __call__(self, obs):
            # Concatenate task to the end of observation
            return np.concatenate((obs, self.agent.curr_high_level_act), axis=-1)

        def reset(self):
            pass

    buffer_preprocessing = Compose([buffer_preprocessing, GetTask(agent)])

    summary_writer, save_path = make_summary_writer(save_path=save_path,
                                                    algo=c.SAC,
                                                    cfg=experiment_config)

    train(agent=agent,
          evaluation_agent=evaluation_agent,
          train_env=train_env,
          evaluation_env=evaluation_env,
          buffer_preprocess=buffer_preprocessing,
          experiment_settings=experiment_config,
          summary_writer=summary_writer,
          save_path=save_path)
def train_grac(experiment_config):
    seed = experiment_config[c.SEED]
    save_path = experiment_config.get(c.SAVE_PATH, None)
    buffer_preprocessing = experiment_config.get(c.BUFFER_PREPROCESSING, Identity())

    set_seed(seed)
    train_env = make_env(experiment_config[c.ENV_SETTING], seed)
    # experiment_config[c.MODEL_SETTING][c.KWARGS][c.CEM] = CEMQ(cov_noise_init=experiment_config[c.COV_NOISE_INIT],
    #                                                            cov_noise_end=experiment_config[c.COV_NOISE_END],
    #                                                            cov_noise_tau=experiment_config[c.COV_NOISE_TAU],
    #                                                            action_dim=experiment_config[c.ACTION_DIM],
    #                                                            batch_size=1,
    #                                                            num_iters=experiment_config[c.NUM_ITERS],
    #                                                            pop_size=experiment_config[c.POP_SIZE],
    #                                                            elite_size=experiment_config[c.ELITE_SIZE],
    #                                                            device=experiment_config[c.DEVICE],
    #                                                            min_action=experiment_config[c.MIN_ACTION],
    #                                                            max_action=experiment_config[c.MAX_ACTION])
    model = make_model(experiment_config[c.MODEL_SETTING])
    buffer = make_buffer(experiment_config[c.BUFFER_SETTING],
                         seed,
                         experiment_config[c.BUFFER_SETTING].get(c.LOAD_BUFFER, False))

    # policy_opt = make_optimizer(model.policy_parameters, experiment_config[c.OPTIMIZER_SETTING])
    policy_opt = make_optimizer(model.policy_parameters,
                                experiment_config[c.OPTIMIZER_SETTING][c.POLICY])
    qs_opt = make_optimizer(model.qs_parameters,
                            experiment_config[c.OPTIMIZER_SETTING][c.QS])

    aux_tasks = make_auxiliary_tasks(experiment_config[c.AUXILIARY_TASKS],
                                     model,
                                     buffer,
                                     experiment_config)

    learning_algorithm = GRAC(model=model,
                              policy_opt=policy_opt,
                              qs_opt=qs_opt,
                              buffer=buffer,
                              algo_params=experiment_config,
                              aux_tasks=aux_tasks)

    load_model = experiment_config.get(c.LOAD_MODEL, False)
    if load_model:
        learning_algorithm.load_state_dict(torch.load(load_model))

    agent = ACAgent(model=model,
                    learning_algorithm=learning_algorithm,
                    preprocess=experiment_config[c.EVALUATION_PREPROCESSING])

    evaluation_env = None
    evaluation_agent = None
    if experiment_config.get(c.EVALUATION_FREQUENCY, 0):
        evaluation_env = make_env(experiment_config[c.ENV_SETTING], seed + 1)
        evaluation_agent = ACAgent(model=model,
                                   learning_algorithm=None,
                                   preprocess=experiment_config[c.EVALUATION_PREPROCESSING])

    summary_writer, save_path = make_summary_writer(save_path=save_path,
                                                    algo=c.GRAC,
                                                    cfg=experiment_config)

    train(agent=agent,
          evaluation_agent=evaluation_agent,
          train_env=train_env,
          evaluation_env=evaluation_env,
          buffer_preprocess=buffer_preprocessing,
          experiment_settings=experiment_config,
          summary_writer=summary_writer,
          save_path=save_path)
def train_sacx_sac_drq(experiment_config):
    seed = experiment_config[c.SEED]
    save_path = experiment_config.get(c.SAVE_PATH, None)
    buffer_preprocessing = experiment_config.get(c.BUFFER_PREPROCESSING, Identity())

    set_seed(seed)
    train_env = make_env(experiment_config[c.ENV_SETTING], seed)
    buffer = make_buffer(experiment_config[c.BUFFER_SETTING],
                         seed,
                         experiment_config[c.BUFFER_SETTING].get(c.LOAD_BUFFER, False))
    intentions = make_model(experiment_config[c.INTENTIONS_SETTING])

    policy_opt = make_optimizer(intentions.policy_parameters,
                                experiment_config[c.OPTIMIZER_SETTING][c.INTENTIONS])
    qs_opt = make_optimizer(intentions.qs_parameters,
                            experiment_config[c.OPTIMIZER_SETTING][c.QS])
    alpha_opt = make_optimizer([intentions.log_alpha],
                               experiment_config[c.OPTIMIZER_SETTING][c.ALPHA])

    aux_tasks = make_auxiliary_tasks(experiment_config[c.AUXILIARY_TASKS],
                                     intentions,
                                     buffer,
                                     experiment_config)

    update_intentions = UpdateSACDrQIntentions(model=intentions,
                                               policy_opt=policy_opt,
                                               qs_opt=qs_opt,
                                               alpha_opt=alpha_opt,
                                               learn_alpha=experiment_config[c.LEARN_ALPHA],
                                               buffer=buffer,
                                               algo_params=experiment_config,
                                               aux_tasks=aux_tasks)

    scheduler = make_model(experiment_config[c.SCHEDULER_SETTING][c.TRAIN])
    update_scheduler = UpdateQScheduler(model=scheduler,
                                        algo_params=experiment_config)

    learning_algorithm = SACX(update_scheduler=update_scheduler,
                              update_intentions=update_intentions,
                              algo_params=experiment_config)

    load_model = experiment_config.get(c.LOAD_MODEL, False)
    if load_model:
        learning_algorithm.load_state_dict(torch.load(load_model))

    agent = SACXAgent(scheduler=scheduler,
                      intentions=intentions,
                      learning_algorithm=learning_algorithm,
                      scheduler_period=experiment_config[c.SCHEDULER_SETTING][c.TRAIN][c.SCHEDULER_PERIOD],
                      preprocess=experiment_config[c.EVALUATION_PREPROCESSING])

    evaluation_env = None
    evaluation_agent = None
    if experiment_config.get(c.EVALUATION_FREQUENCY, 0):
        evaluation_env = make_env(experiment_config[c.ENV_SETTING], seed + 1)
        evaluation_agent = SACXAgent(scheduler=make_model(experiment_config[c.SCHEDULER_SETTING][c.EVALUATION]),
                                     intentions=intentions,
                                     learning_algorithm=None,
                                     scheduler_period=experiment_config[c.SCHEDULER_SETTING][c.EVALUATION][c.SCHEDULER_PERIOD],
                                     preprocess=experiment_config[c.EVALUATION_PREPROCESSING])

    summary_writer, save_path = make_summary_writer(save_path=save_path,
                                                    algo=c.SACX,
                                                    cfg=experiment_config)

    train(agent=agent,
          evaluation_agent=evaluation_agent,
          train_env=train_env,
          evaluation_env=evaluation_env,
          buffer_preprocess=buffer_preprocessing,
          auxiliary_reward=experiment_config[c.AUXILIARY_REWARDS].reward,
          experiment_settings=experiment_config,
          summary_writer=summary_writer,
          save_path=save_path)