import numpy as np
import torch
import torch.nn as nn

from seagul.nn import MLP
from seagul.rl.models import PPOModel

input_size = 4
output_size = 1
layer_size = 0
num_layers = 0
activation = nn.ReLU

proc_list = []
for seed in range(200)[-7:]:
    # policy = MLP(input_size, output_size, num_layers, layer_size, activation)
    policy = torch.load("warm/LQR_policy")  # warm start from a saved LQR policy
    value_fn = MLP(input_size, 1, num_layers, layer_size, activation)
    model = PPOModel(
        policy=policy,
        value_fn=value_fn,
        discrete=False,
        # hold_count = 0
    )

    # quadratic penalty on distance from the upright state (theta = pi, rest zero)
    def reward_fn(ns, act):
        return -1e-2 * ((ns[0] - np.pi) ** 2 + ns[1] ** 2 + .1 * ns[2] ** 2 + .2 * ns[3] ** 2)
        # return 1e-2*(np.cos(ns[0]) + np.cos(ns[0] + ns[1]))

    env_config = {
        "max_torque": 25,
        "init_state": [np.pi, 0.0, 0.0, 0.0],
        "init_state_weights": np.array([0, 0, 0, 0]),
        "dt": .02,
        "max_t": 1,
        "act_hold": 1,
        "fixed_step": True,
torch.set_default_dtype(torch.double)

proc_list = []
for seed in [0]:
    policy = MLP(input_size, output_size, num_layers, layer_size, activation)
    # model = PPOModelActHold(
    #     policy=policy,
    #     value_fn=MLP(input_size, 1, num_layers, layer_size, activation),
    #     discrete=False,
    #     hold_count = 200
    # )
    model = PPOModel(
        policy=policy,
        value_fn=MLP(input_size, 1, num_layers, layer_size, activation),
        discrete=False,
    )

    arg_dict = {
        "env_name": env_name,
        "model": model,
        "action_var_schedule": [1, 1],
        "seed": seed,  # int((time.time() % 1)*1e8),
        "num_epochs": 1000,
        "epoch_batch_size": 2048,
        "gamma": 1,
        "p_epochs": 10,
        "v_epochs": 10,
    }

    run_name = "ppo" + str(seed)
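# (sketch) The loop above only builds arg_dict and run_name; the launch itself is
# not part of this fragment. Given the proc_list variable, one plausible pattern is
# to dispatch each seed in its own process. run_ppo below is a hypothetical stand-in
# for whatever worker actually unpacks arg_dict and calls the seagul PPO trainer:
import torch.multiprocessing as mp

def run_ppo(run_name, arg_dict):
    # hypothetical worker: unpack arg_dict, run PPO, save the result under run_name
    pass

if __name__ == "__main__":
    proc_list = []
    for seed in [0]:
        arg_dict = {"seed": seed}  # stand-in for the full arg_dict built above
        proc = mp.Process(target=run_ppo, args=("ppo" + str(seed), arg_dict))
        proc.start()
        proc_list.append(proc)
    for proc in proc_list:
        proc.join()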
from seagul.old.sac.sac_ray import ray_sac
from seagul.nn import MLP
from seagul.rl.models import SACModel, PPOModel

input_size = 17
output_size = 6
layer_size = 64
num_layers = 2

policy = MLP(input_size, output_size * 2, num_layers, layer_size)  # 2 * output_size: mean and std per action dim
value_fn = MLP(input_size, 1, num_layers, layer_size)
q1_fn = MLP(input_size + output_size, 1, num_layers, layer_size)  # Q networks take a state-action pair
q2_fn = MLP(input_size + output_size, 1, num_layers, layer_size)
model = SACModel(policy, value_fn, q1_fn, q2_fn, 3)

ppo_policy = MLP(input_size, output_size, num_layers, layer_size)
ppo_model = PPOModel(ppo_policy, value_fn)

env_name = "Walker2d-v2"

model, rews, var_dict = ray_sac(
    env_name,
    100000,
    model,
    env_steps=1000,
    iters_per_update=100,
    min_steps_per_update=100,
    reward_stop=1000,
    exploration_steps=1000,
)
# model, rews, var_dict = ppo(env_name, 3e5, ppo_model)

globals().update(var_dict)
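# (sketch) After ray_sac returns, the trained policy can be sanity-checked with a
# plain gym rollout. This assumes the first output_size entries of the policy output
# are the action means (the rest being the stds) -- a guess about how SACModel lays
# out the MLP output, not something the snippet above guarantees:
import gym
import torch

env = gym.make(env_name)
obs = env.reset()
ep_rew = 0.0
done = False
while not done:
    with torch.no_grad():
        out = policy(torch.as_tensor(obs, dtype=torch.float32))
    act = out[:output_size].numpy()  # assumed layout: means first, stds second
    obs, rew, done, _ = env.step(act)
    ep_rew += rew
print("episode reward:", ep_rew)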
    return reward, done

env_config1 = {
    "init_state": [0, 0, 0, 0],
    "max_torque": max_torque,
    "init_state_weights": [0, 0, 0, 0],
    "dt": .01,
    "reward_fn": reward_fn,
    "max_t": max_t,
    "act_hold": 20
}

policy = MLP(input_size, output_size, num_layers, layer_size, activation)
value_fn = MLP(input_size, 1, num_layers, layer_size, activation)
model1 = PPOModel(policy=policy, value_fn=value_fn, action_var=1)

env_config2 = {
    "init_state": [0, 0, 0, 0],
    "max_torque": max_torque,
    "init_state_weights": [0, 0, 0, 0],
    "dt": .01,
    "reward_fn": reward_fn,
    "max_t": max_t,
    "act_hold": 1
}

model2 = PPOModelActHold(policy=policy, value_fn=value_fn, action_var=1, hold_count=20)
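# (note) Both setups presumably apply each policy action for the same wall-clock
# time: config 1 holds the action inside the env (act_hold=20 at dt=.01), while
# config 2 steps the env every .01 s and lets PPOModelActHold repeat the action
# hold_count=20 times. A quick check of the effective control period:
period1 = env_config1["dt"] * env_config1["act_hold"]  # 0.01 * 20 = 0.2 s
period2 = env_config2["dt"] * 20                       # hold_count = 20 -> 0.2 s
assert abs(period1 - period2) < 1e-12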
from seagul.rl.algos import ppo
from seagul.nn import MLP
from seagul.rl.models import PPOModel
import torch

torch.set_default_dtype(torch.double)  # TODO need to update everything to support arbitrary dtypes

input_size = 3
output_size = 1
layer_size = 64
num_layers = 2

policy = MLP(input_size, output_size, num_layers, layer_size)
value_fn = MLP(input_size, 1, num_layers, layer_size)
model = PPOModel(policy, value_fn)

model, rews, var_dict = ppo("Pendulum-v0", 10000, model)
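# (sketch) A quick way to watch the result on Pendulum-v0, whose 3-D observation
# (cos th, sin th, thdot) and 1-D torque match input_size/output_size above. This
# assumes the policy MLP is trained in place by ppo and outputs the mean action
# directly, which the snippet above does not state explicitly:
import gym

env = gym.make("Pendulum-v0")
obs = env.reset()
for _ in range(200):
    with torch.no_grad():
        act = policy(torch.as_tensor(obs, dtype=torch.double)).numpy()
    obs, rew, done, _ = env.step(act)
    env.render()
    if done:
        break
env.close()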
for seed in [0, 1, 2, 3]:
    policy = MLP(input_size, output_size, num_layers, layer_size, activation)
    value_fn = MLP(input_size, 1, num_layers, layer_size, activation)
    # model = PPOModelActHold(
    #     policy=policy,
    #     value_fn=MLP(input_size, 1, num_layers, layer_size, activation),
    #     discrete=False,
    #     hold_count = 10
    # )
    model = PPOModel(policy=policy, value_fn=value_fn, discrete=False)

    def reward_fn(s):
        if s[3] == 1:
            if s[0] > 2 and s[2] > 3:
                reward = 5.0
                s[3] = 0
            else:
                reward = 0.0
        elif s[3] == 0:
            if s[0] < -2 and s[2] < -3:
                reward = 5.0
                s[3] = 1
            else: