def train_on_env(model, env_name, episodes):
    # Initialize an optimizer for model and a summary writer
    opt = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, betas=BETAS)
    model.train()
    writer = SummaryWriter()
    env = gym.make(env_name)
    next_s = ops.prep_state(env.reset())  # initial state to send to sample
    for update in range(UPDATES):
        # sample() expects a step count; STEPS is the rollout length per update
        samples, total_reward, next_s = sample(model, env, STEPS, next_s)
        total_loss = PPOtrain(model, opt, samples)
        writer.add_scalar("Loss", total_loss, update)
        writer.add_scalar("Total Reward", total_reward, update)
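# PPOtrain() is called above but not defined in this section. Below is a
# minimal sketch, assuming the model returns (distribution, logits, value)
# as in sample(), and that CLIP_EPS, GAMMA, and PPO_EPOCHS are config
# constants; plain discounted returns stand in for GAE here.
import torch
import torch.nn.functional as F

CLIP_EPS = 0.2   # assumed clipping range for the PPO surrogate
GAMMA = 0.99     # assumed discount factor
PPO_EPOCHS = 4   # assumed passes over each rollout

def PPOtrain(model, opt, samples):
    states = torch.cat(samples.states)
    actions = torch.cat(samples.actions)
    old_log_probs = torch.cat(samples.log_probs).detach()

    # Discounted returns, computed backwards through the rollout
    returns, R = [], 0.0
    for r, d in zip(reversed(samples.rewards), reversed(samples.dones)):
        R = r.item() + GAMMA * R * (1 - d)
        returns.insert(0, R)
    returns = torch.tensor(returns)

    total_loss = 0.0
    for _ in range(PPO_EPOCHS):
        pi, logits, v = model(states)
        log_probs = pi.log_prob(actions)
        advantages = returns - v.squeeze(-1).detach()

        # Clipped surrogate objective: cap the policy ratio at 1 +/- CLIP_EPS
        ratio = torch.exp(log_probs - old_log_probs)
        clipped = torch.clamp(ratio, 1 - CLIP_EPS, 1 + CLIP_EPS)
        policy_loss = -torch.min(ratio * advantages, clipped * advantages).mean()
        value_loss = F.mse_loss(v.squeeze(-1), returns)

        loss = policy_loss + 0.5 * value_loss
        opt.zero_grad()
        loss.backward()
        opt.step()
        total_loss += loss.item()
    return total_loss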
def sample(model, env, steps, start_s):
    store = Rollout_Storage()
    total_r = 0
    s = start_s
    if s is None:
        s = ops.prep_state(env.reset())
    for step in range(steps):
        pi, logits, v = model(s)
        a = pi.sample()          # Sample action
        log_p = pi.log_prob(a)   # Log prob of "a" being selected
        next_s, r, d = ops.mod_step(env, a)
        total_r += r.item()
        store.add([s, next_s, a, log_p, r, d])
        s = next_s
    return store, total_r, s
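# Rollout_Storage and ops.mod_step aren't defined in this section. A minimal
# sketch of both, with the storage layout inferred from the order passed to
# store.add() above, and mod_step assumed to wrap env.step so it accepts a
# tensor action, auto-resets on episode end, and returns the reward as a tensor:
import torch
from ops import prep_state  # the author's preprocessing helper (sketched further below)

class Rollout_Storage:
    def __init__(self):
        self.states, self.next_states = [], []
        self.actions, self.log_probs = [], []
        self.rewards, self.dones = [], []

    def add(self, transition):
        # transition = [s, next_s, a, log_p, r, d], matching sample() above
        s, next_s, a, log_p, r, d = transition
        self.states.append(s)
        self.next_states.append(next_s)
        self.actions.append(a)
        self.log_probs.append(log_p)
        self.rewards.append(r)
        self.dones.append(d)

def mod_step(env, a):
    # Step with a tensor action, reset when the episode ends, and hand back
    # a preprocessed next state plus the reward as a tensor
    next_s, r, done, _ = env.step(a.item())
    if done:
        next_s = env.reset()
    return prep_state(next_s), torch.tensor([float(r)]), done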
SPF = 1 / FPS  # seconds per frame, so rendering runs at a watchable speed
env = gym.make(ENV_NAME)
agent = Agent()
try:
    agent.model.load_state_dict(torch.load("params.pt"))
    print("Loaded checkpoint")
except (FileNotFoundError, RuntimeError):  # narrower than a bare except
    print("Could not load checkpoint")
while True:
    total_r = 0
    s = env.reset()
    s = prep_state(s)
    for i in range(TIME_LIMIT):
        time.sleep(SPF)
        env.render()
        a = agent.act(s, explore=False)  # greedy policy for evaluation
        s, r, done, _ = env.step(a)
        s = prep_state(s)
        total_r += r
        if done:
            break
    print(total_r)
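# The Agent class isn't shown in this section. Given the explore flag above
# and the Q-loss tracked in the training loop below, act() is plausibly
# epsilon-greedy over a Q-network; the network itself and N_ACTIONS are pure
# placeholders here.
import random
import torch
import torch.nn as nn

N_ACTIONS = 3  # assumed discrete action count (e.g. MountainCar)

class Agent:
    def __init__(self, epsilon=0.1):
        # Placeholder Q-network; the real architecture isn't shown
        self.model = nn.Linear(2, N_ACTIONS)
        self.epsilon = epsilon

    def act(self, s, explore=True):
        # With explore=False (evaluation), always act greedily
        if explore and random.random() < self.epsilon:
            return random.randrange(N_ACTIONS)
        with torch.no_grad():
            q = self.model(s)
        return q.argmax(dim=1).item()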
# Try loading previous agent
if LOAD_CHECKPOINTS:
    try:
        agent.model.load_state_dict(torch.load("params.pt"))
        print("Loaded checkpoint")
    except (FileNotFoundError, RuntimeError):
        print("Could not load checkpoint")
scores = []
step = 0
for e in range(EPISODES):
    total_r = 0
    s = env.reset()
    s = prep_state(s)
    q_loss = 0
    for t in range(TIME_LIMIT):
        env.render()
        a = agent.act(s)
        s_new, r, done, _ = env.step(a)
        # Reward shaping: replace the env reward with the second observation
        # component (the velocity in MountainCar)
        r = s_new[1]
        total_r += r
        s_new = prep_state(s_new)
        agent.add_exp([s, a, r, s_new, done])
        s = s_new
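# agent.add_exp() above presumably pushes transitions into a replay buffer.
# A minimal sketch of such a buffer (the class name and capacity are
# assumptions, not the author's code):
import random
from collections import deque

class ReplayBuffer:
    def __init__(self, capacity=100_000):
        self.buffer = deque(maxlen=capacity)  # oldest transitions fall off the left

    def add(self, transition):
        # transition = [s, a, r, s_new, done], matching add_exp's argument above
        self.buffer.append(transition)

    def sample(self, batch_size):
        # Uniform random minibatch for a Q-learning update
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)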
from ops import prep_state
import numpy as np
import torch
import gym

env = gym.make('Breakout-v0')
s = env.reset()
s_prep = prep_state(s)
print(s_prep.shape)
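# ops.prep_state itself isn't shown in this section. Judging from its use on
# both low-dimensional states and Atari frames, a minimal guess is "numpy
# array in, batched float tensor out"; real Atari pipelines usually also
# grayscale and resize the frame.
import numpy as np
import torch

def prep_state(s):
    # Convert the raw observation to float32 and add a batch dimension,
    # e.g. (210, 160, 3) -> (1, 210, 160, 3) for Breakout
    return torch.from_numpy(np.asarray(s, dtype=np.float32)).unsqueeze(0)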