from rllite import SAC


def run(lock, shared_eps_num, shared_eps_reward, hyper_param):
    model = SAC(
        env_name='Pendulum-v0',
        load_dir='./ckpt/ckpt_' + str(hyper_param[0]) + '_' + str(hyper_param[1]),
        log_dir='./log/log_' + str(hyper_param[0]) + '_' + str(hyper_param[1]),
        buffer_size=1e6,
        seed=hyper_param[1],
        max_episode_steps=500,  # manual set
        batch_size=hyper_param[0],
        discount=0.99,
        learning_starts=500,
        tau=0.005,
        save_eps_num=100)

    timesteps = 0
    total_timesteps = 1e5
    max_eps_steps = 100

    # train
    while timesteps < total_timesteps:
        episode_reward = 0
        done = False
        eps_steps = 0
        obs = model.env.reset()
        while not done and eps_steps < max_eps_steps:
            action = model.predict(obs)
            new_obs, reward, done, info = model.env.step(action)
            model.replay_buffer.push(obs, action, reward, new_obs, done)
            obs = new_obs
            episode_reward += reward
            eps_steps += 1
            timesteps += 1
            if timesteps > model.learning_starts:
                model.train_step()

        model.episode_num += 1
        model.writer.add_scalar('episode_reward', episode_reward, model.episode_num)

        lock.acquire()
        shared_eps_num.value = model.episode_num
        shared_eps_reward.value = episode_reward
        lock.release()
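# run() above takes a lock and two shared values, so it is presumably meant to
# be started from several worker processes, one per hyperparameter setting,
# while the parent process reads the shared episode counter and reward.
# A minimal sketch of such a launcher using only the standard multiprocessing
# module follows; the (batch_size, seed) grid is illustrative, not part of rllite.
import multiprocessing as mp

if __name__ == '__main__':
    lock = mp.Lock()
    hyper_params = [(64, 1), (64, 2), (128, 1), (128, 2)]  # hypothetical grid

    workers = []
    for hp in hyper_params:
        eps_num = mp.Value('i', 0)       # shared episode counter for this worker
        eps_reward = mp.Value('d', 0.0)  # shared last-episode reward
        p = mp.Process(target=run, args=(lock, eps_num, eps_reward, hp))
        p.start()
        workers.append(p)

    for p in workers:
        p.join()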
from rllite import SAC
from env import Env
import numpy as np
# from rllite.common import choose_gpu, GymDelay

# choose your GPU if you have more than one
# choose_gpu(0)

env = Env(10666)

# set
model = SAC(
    external_env=env,       # import your env
    env_name="ssl_env",     # your env name
    load_dir='./ckpt',
    log_dir="./log",
    buffer_size=1e6,
    seed=2,
    max_episode_steps=500,  # manual set
    batch_size=64,
    discount=0.99,
    learning_starts=1000,
    tau=0.005,
    save_eps_num=100)

# model.learn(1e6)

timesteps = 0
total_timesteps = 1e6
max_eps_steps = 500  # tricky limit
xy_acc = 4.0 / 75.0 / 2
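# Env above is a project-specific environment (the constructor argument looks
# like a port number). The examples in this section drive the environment
# through the usual gym-style reset()/step() interface, so a skeleton of such
# an external environment might look like the sketch below; the class name
# matches the import above, but the spaces, shapes, and method bodies are
# placeholders, not the real ssl_env.
import numpy as np
import gym
from gym import spaces


class Env(gym.Env):
    def __init__(self, port):
        self.port = port  # connection parameter for the external simulator
        self.observation_space = spaces.Box(-np.inf, np.inf, shape=(8,), dtype=np.float32)
        self.action_space = spaces.Box(-1.0, 1.0, shape=(2,), dtype=np.float32)

    def reset(self):
        # ask the simulator for an initial observation
        return np.zeros(self.observation_space.shape, dtype=np.float32)

    def step(self, action):
        # send the action to the simulator and read back (obs, reward, done, info)
        obs = np.zeros(self.observation_space.shape, dtype=np.float32)
        return obs, 0.0, False, {}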
from rllite import SAC

model = SAC('Pendulum-v0').learn(1e6)
from rllite import SAC

# set
model = SAC(
    env_name='Pendulum-v0',
    load_dir='./ckpt',
    log_dir="./log",
    buffer_size=1e6,
    seed=1,
    max_episode_steps=None,
    batch_size=64,
    discount=0.99,
    learning_starts=500,
    tau=0.005,
    save_eps_num=100)

# train
model.learn(1e6)

# eval
for _ in range(10):
    done = False
    obs = model.env.reset()
    while not done:
        action = model.predict(obs)
        obs, reward, done, info = model.env.step(action)
        model.env.render()

model.env.close()