def algo_init(args): logger.info('Run algo_init function') setup_seed(args['seed']) if args["obs_shape"] and args["action_shape"]: obs_shape, action_shape = args["obs_shape"], args["action_shape"] max_action = args["max_action"] elif "task" in args.keys(): from offlinerl.utils.env import get_env_shape, get_env_action_range obs_shape, action_shape = get_env_shape(args['task']) max_action, _ = get_env_action_range(args["task"]) args["obs_shape"], args["action_shape"] = obs_shape, action_shape else: raise NotImplementedError net_a = Net(layer_num=args['actor_layers'], state_shape=obs_shape, hidden_layer_size=args['actor_features']) actor = TanhGaussianPolicy(preprocess_net=net_a, action_shape=action_shape, hidden_layer_size=args['actor_features'], conditioned_sigma=True).to(args['device']) actor_optim = torch.optim.Adam(actor.parameters(), lr=args['actor_lr']) return { "actor" : {"net" : actor, "opt" : actor_optim}, }
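# Hedged usage sketch (not part of the original source): the hyper-parameter
# values below are hypothetical; only the key names are taken from algo_init
# above. It shows how the returned nets dict is typically unpacked.
def _example_algo_init_usage():
    example_args = {
        "seed": 42, "obs_shape": 17, "action_shape": 6, "max_action": 1.0,
        "actor_layers": 2, "actor_features": 256, "actor_lr": 1e-4,
        "device": "cpu",
    }
    nets = algo_init(example_args)
    actor, actor_optim = nets["actor"]["net"], nets["actor"]["opt"]
    return actor, actor_optim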
def launch_ope(config):
    '''Run off-policy evaluation for a single seed.'''
    setup_seed(config['seed'])

    if config['ope'] == 'fqe':
        evaluator = FQEEvaluator()
    elif config['ope'] == 'is':
        evaluator = ISEvaluator()
    else:
        raise NotImplementedError

    train_dataset, val_dataset = get_neorl_datasets(config["domain"],
                                                    config['level'],
                                                    config['amount'])
    evaluator.initialize(train_dataset=train_dataset, val_dataset=val_dataset)

    # Ground-truth return of the last checkpoint, read from the training logs.
    exp_folder = os.path.join(config['task_folder'], config['exp_name'])
    with open(os.path.join(exp_folder, 'metric_logs.json'), 'r') as f:
        metrics = json.load(f)
    max_step = str(max(map(int, metrics.keys())))
    gt = metrics[max_step]['Reward_Mean_Env']

    # Estimate the same checkpoint's value with the chosen OPE method.
    policy_file = os.path.join(exp_folder, 'models', f'{max_step}.pt')
    policy = torch.load(policy_file)
    ope = evaluator(policy)

    return {
        'gt': gt,
        'ope': ope,
        'policy_file': policy_file,
        'exp_name': config['exp_name'],
        'seed': config['seed'],
    }
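# Hedged follow-up sketch (an assumption, not from the source): across many
# experiments, the (gt, ope) pairs returned by launch_ope are commonly compared
# with a rank correlation between ground-truth returns and the OPE estimates;
# scipy's spearmanr is used here only for illustration.
from scipy.stats import spearmanr

def _rank_correlation_sketch(results):
    """results: list of dicts as returned by launch_ope."""
    gts = [r['gt'] for r in results]
    opes = [r['ope'] for r in results]
    corr, _ = spearmanr(gts, opes)
    return corr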
def algo_init(args): logger.info('Run algo_init function') setup_seed(args['seed']) if args["obs_shape"] and args["action_shape"]: obs_shape, action_shape = args["obs_shape"], args["action_shape"] elif "task" in args.keys(): from offlinerl.utils.env import get_env_shape obs_shape, action_shape = get_env_shape(args['task']) args["obs_shape"], args["action_shape"] = obs_shape, action_shape else: raise NotImplementedError if isinstance(args["obs_shape"], int): state_dim = (4, 84, 84) critic = Conv_Q(state_dim[0], args["action_shape"]).to(args['device']) else: critic = FC_Q(np.prod(args["obs_shape"]), args["action_shape"]).to(args['device']) critic_opt = optim.Adam(critic.parameters(), **args["optimizer_parameters"]) nets = { "critic": { "net": critic, "opt": critic_opt }, } return nets
def algo_init(args): logger.info('Run algo_init function') setup_seed(args['seed']) if args["obs_shape"] and args["action_shape"]: obs_shape, action_shape = args["obs_shape"], args["action_shape"] max_action = args["max_action"] elif "task" in args.keys(): from offlinerl.utils.env import get_env_shape, get_env_action_range obs_shape, action_shape = get_env_shape(args['task']) max_action, _ = get_env_action_range(args["task"]) args["obs_shape"], args["action_shape"] = obs_shape, action_shape else: raise NotImplementedError vae = VAE(obs_shape, action_shape, args['vae_features'], args['vae_layers'], max_action).to(args['device']) vae_optim = torch.optim.Adam(vae.parameters(), lr=args['vae_lr']) jitter = Jitter(obs_shape, action_shape, args['jitter_features'], args['jitter_layers'], max_action, args['phi']).to(args['device']) jitter_optim = torch.optim.Adam(jitter.parameters(), lr=args['jitter_lr']) q1 = MLP(obs_shape + action_shape, 1, args['value_features'], args['value_layers'], hidden_activation='relu').to(args['device']) q2 = MLP(obs_shape + action_shape, 1, args['value_features'], args['value_layers'], hidden_activation='relu').to(args['device']) critic_optim = torch.optim.Adam([*q1.parameters(), *q2.parameters()], lr=args['critic_lr']) return { "vae": { "net": vae, "opt": vae_optim }, "jitter": { "net": jitter, "opt": jitter_optim }, "critic": { "net": [q1, q2], "opt": critic_optim }, }
def algo_init(args): logger.info('Run algo_init function') setup_seed(args['seed']) if args["obs_shape"] and args["action_shape"]: obs_shape, action_shape = args["obs_shape"], args["action_shape"] elif "task" in args.keys(): from offlinerl.utils.env import get_env_shape obs_shape, action_shape = get_env_shape(args['task']) args["obs_shape"], args["action_shape"] = obs_shape, action_shape else: raise NotImplementedError transition = EnsembleTransition(obs_shape, action_shape, args['hidden_layer_size'], args['transition_layers'], args['transition_init_num']).to(args['device']) transition_optim = torch.optim.Adam(transition.parameters(), lr=args['transition_lr'], weight_decay=0.000075) net_a = Net(layer_num=args['hidden_layers'], state_shape=obs_shape, hidden_layer_size=args['hidden_layer_size']) actor = TanhGaussianPolicy(preprocess_net=net_a, action_shape=action_shape, hidden_layer_size=args['hidden_layer_size'], conditioned_sigma=True).to(args['device']) actor_optim = torch.optim.Adam(actor.parameters(), lr=args['actor_lr']) log_alpha = torch.zeros(1, requires_grad=True, device=args['device']) alpha_optimizer = torch.optim.Adam([log_alpha], lr=args["actor_lr"]) q1 = MLP(obs_shape + action_shape, 1, args['hidden_layer_size'], args['hidden_layers'], norm=None, hidden_activation='swish').to(args['device']) q2 = MLP(obs_shape + action_shape, 1, args['hidden_layer_size'], args['hidden_layers'], norm=None, hidden_activation='swish').to(args['device']) critic_optim = torch.optim.Adam([*q1.parameters(), *q2.parameters()], lr=args['actor_lr']) return { "transition" : {"net" : transition, "opt" : transition_optim}, "actor" : {"net" : actor, "opt" : actor_optim}, "log_alpha" : {"net" : log_alpha, "opt" : alpha_optimizer}, "critic" : {"net" : [q1, q2], "opt" : critic_optim}, }
def training_dynamics(config): if config["task"] == 'finance' and config["amount"] == 10000: return { 'performance': [], 'path': '', } seed = config['seed'] setup_seed(seed) train_buffer, val_buffer = load_data_from_neorl(config["task"], config["level"], config["amount"]) obs_shape = train_buffer['obs'].shape[-1] action_shape = train_buffer['act'].shape[-1] device = 'cuda' hidden_units = 1024 if config["task"] in ['ib', 'finance', 'citylearn' ] else 256 transition = EnsembleTransition(obs_shape, action_shape, hidden_units, 4, 7).to(device) transition_optim = torch.optim.AdamW(transition.parameters(), lr=1e-3, weight_decay=0.000075) data_size = len(train_buffer) val_size = min(int(data_size * 0.2) + 1, 1000) train_size = data_size - val_size train_splits, val_splits = torch.utils.data.random_split( range(data_size), (train_size, val_size)) valdata = train_buffer[val_splits.indices] train_buffer = train_buffer[train_splits.indices] batch_size = 256 val_losses = [float('inf') for i in range(7)] epoch = 0 cnt = 0 while True: epoch += 1 idxs = np.random.randint(train_buffer.shape[0], size=[7, train_buffer.shape[0]]) for batch_num in range(int(np.ceil(idxs.shape[-1] / batch_size))): batch_idxs = idxs[:, batch_num * batch_size:(batch_num + 1) * batch_size] batch = train_buffer[batch_idxs] _train_transition(transition, batch, transition_optim, device) new_val_losses = _eval_transition(transition, valdata, device) indexes = [] for i, new_loss, old_loss in zip(range(len(val_losses)), new_val_losses, val_losses): if new_loss < old_loss: indexes.append(i) val_losses[i] = new_loss if len(indexes) > 0: transition.update_save(indexes) cnt = 0 else: cnt += 1 if cnt >= 5: break indexes = _select_best_indexes(val_losses, n=5) transition.set_select(indexes) performance = _eval_transition(transition, valdata, device) transition_path = os.path.join( config['dynamics_path'], f'{config["task"]}-{config["level"]}-{config["amount"]}-{seed}.pt') torch.save(transition, transition_path) return { 'performance': performance, 'path': transition_path, }
def algo_init(args): logger.info('Run algo_init function') setup_seed(args['seed']) if args["obs_shape"] and args["action_shape"]: obs_shape, action_shape = args["obs_shape"], args["action_shape"] elif "task" in args.keys(): from offlinerl.utils.env import get_env_shape obs_shape, action_shape = get_env_shape(args['task']) args["obs_shape"], args["action_shape"] = obs_shape, action_shape else: raise NotImplementedError net_a = Net(layer_num=args['layer_num'], state_shape=obs_shape, hidden_layer_size=args['hidden_layer_size']) actor = TanhGaussianPolicy( preprocess_net=net_a, action_shape=action_shape, hidden_layer_size=args['hidden_layer_size'], conditioned_sigma=True, ).to(args['device']) actor_optim = optim.Adam(actor.parameters(), lr=args['actor_lr']) net_c1 = Net(layer_num=args['layer_num'], state_shape=obs_shape, action_shape=action_shape, concat=True, hidden_layer_size=args['hidden_layer_size']) critic1 = Critic( preprocess_net=net_c1, hidden_layer_size=args['hidden_layer_size'], ).to(args['device']) critic1_optim = optim.Adam(critic1.parameters(), lr=args['critic_lr']) net_c2 = Net(layer_num=args['layer_num'], state_shape=obs_shape, action_shape=action_shape, concat=True, hidden_layer_size=args['hidden_layer_size']) critic2 = Critic( preprocess_net=net_c2, hidden_layer_size=args['hidden_layer_size'], ).to(args['device']) critic2_optim = optim.Adam(critic2.parameters(), lr=args['critic_lr']) if args["use_automatic_entropy_tuning"]: if args["target_entropy"]: target_entropy = args["target_entropy"] else: target_entropy = -np.prod(args["action_shape"]).item() log_alpha = torch.zeros(1, requires_grad=True, device=args['device']) alpha_optimizer = optim.Adam( [log_alpha], lr=args["actor_lr"], ) nets = { "actor": { "net": actor, "opt": actor_optim }, "critic1": { "net": critic1, "opt": critic1_optim }, "critic2": { "net": critic2, "opt": critic2_optim }, "log_alpha": { "net": log_alpha, "opt": alpha_optimizer, "target_entropy": target_entropy }, } if args["lagrange_thresh"] >= 0: target_action_gap = args["lagrange_thresh"] log_alpha_prime = torch.zeros(1, requires_grad=True, device=args['device']) alpha_prime_optimizer = optim.Adam( [log_alpha_prime], lr=args["critic_lr"], ) nets.update({ "log_alpha_prime": { "net": log_alpha_prime, "opt": alpha_prime_optimizer } }) return nets
def algo_init(args): logger.info('Run algo_init function') setup_seed(args['seed']) if args["obs_shape"] and args["action_shape"]: obs_shape, action_shape = args["obs_shape"], args["action_shape"] max_action = args["max_action"] elif "task" in args.keys(): from offlinerl.utils.env import get_env_shape, get_env_action_range obs_shape, action_shape = get_env_shape(args['task']) max_action, _ = get_env_action_range(args["task"]) args["obs_shape"], args["action_shape"] = obs_shape, action_shape else: raise NotImplementedError latent_dim = action_shape *2 vae = VAE(state_dim = obs_shape, action_dim = action_shape, latent_dim = latent_dim, max_action = max_action, hidden_size=args["vae_hidden_size"]).to(args['device']) vae_opt = optim.Adam(vae.parameters(), lr=args["vae_lr"]) if args["latent"]: actor = ActorPerturbation(obs_shape, action_shape, latent_dim, max_action, max_latent_action=2, phi=args['phi']).to(args['device']) else: net_a = Net(layer_num = args["layer_num"], state_shape = obs_shape, hidden_layer_size = args["hidden_layer_size"]) actor = Actor(preprocess_net = net_a, action_shape = latent_dim, max_action = max_action, hidden_layer_size = args["hidden_layer_size"]).to(args['device']) actor_opt = optim.Adam(actor.parameters(), lr=args["actor_lr"]) net_c1 = Net(layer_num = args['layer_num'], state_shape = obs_shape, action_shape = action_shape, concat = True, hidden_layer_size = args['hidden_layer_size']) critic1 = Critic(preprocess_net = net_c1, hidden_layer_size = args['hidden_layer_size'], ).to(args['device']) critic1_opt = optim.Adam(critic1.parameters(), lr=args['critic_lr']) net_c2 = Net(layer_num = args['layer_num'], state_shape = obs_shape, action_shape = action_shape, concat = True, hidden_layer_size = args['hidden_layer_size']) critic2 = Critic(preprocess_net = net_c2, hidden_layer_size = args['hidden_layer_size'], ).to(args['device']) critic2_opt = optim.Adam(critic2.parameters(), lr=args['critic_lr']) return { "vae" : {"net" : vae, "opt" : vae_opt}, "actor" : {"net" : actor, "opt" : actor_opt}, "critic1" : {"net" : critic1, "opt" : critic1_opt}, "critic2" : {"net" : critic2, "opt" : critic2_opt}, }