import argparse
import glob
import os
import time

import gym
import pybullet_envs  # noqa: F401 -- registers Bullet envs (e.g. AntBulletEnv-v0) with gym
import torch

from model import GaussianPolicy


def run():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_id', type=str, default='AntBulletEnv-v0')
    parser.add_argument('--log_name', type=str, default='')
    parser.add_argument('--cuda', action='store_true')
    parser.add_argument('--seed', type=int, default=0)
    args = parser.parse_args()

    # Use the requested log directory, or fall back to the most recent run.
    if args.log_name:
        log_dir = os.path.join('logs', args.env_id, args.log_name)
    else:
        env_dir = os.path.join('logs', args.env_id, '*')
        dirs = glob.glob(env_dir)
        log_dir = max(dirs, key=os.path.getctime)
        print(f'using {log_dir}')

    env = gym.make(args.env_id)
    device = torch.device(
        "cuda" if args.cuda and torch.cuda.is_available() else "cpu")

    # Restore the trained policy and freeze it for evaluation.
    policy = GaussianPolicy(
        env.observation_space.shape[0],
        env.action_space.shape[0],
        hidden_units=[256, 256]).to(device)
    policy.load(os.path.join(log_dir, 'model', 'policy.pth'))
    grad_false(policy)

    def exploit(state):
        # Act greedily: take the deterministic (mean) action, not a sample.
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        with torch.no_grad():
            _, _, action = policy.sample(state)
        return action.cpu().numpy().reshape(-1)

    # PyBullet envs open the viewer on the first render(), before reset().
    env.render()
    while True:
        state = env.reset()
        episode_reward = 0.
        done = False
        while not done:
            env.render()
            action = exploit(state)
            next_state, reward, done, _ = env.step(action)
            episode_reward += reward
            state = next_state
        print(f'total reward: {episode_reward}')
        time.sleep(1)
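# `grad_false` is imported from elsewhere in this repo and is not shown above.
# A minimal sketch of what it presumably does -- an assumption, not the repo's
# actual implementation: put the network into inference-only mode.
def grad_false(network):
    for param in network.parameters():
        param.requires_grad = False  # stop tracking gradients at evaluation time
    network.eval()                   # disable train-mode behaviour (e.g. dropout)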
import argparse
import os

import gym
import numpy as np
import torch

from model import GaussianPolicy


def testing():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_name', type=str, default='HalfCheetah-v2')
    parser.add_argument('--num_episode', type=int, default=10)
    args = parser.parse_args()
    num_episode = args.num_episode

    env = gym.make(args.env_name)
    device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu")

    # Restore the trained policy and freeze it for evaluation.
    policy = GaussianPolicy(
        env.observation_space.shape[0],
        env.action_space.shape[0],
        hidden_units=[256, 256]).to(device)
    policy.load(os.path.join('models', args.env_name, 'policy.pth'))
    grad_false(policy)

    def exploit(state):
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        with torch.no_grad():
            _, _, action = policy.sample(state)
        return action.cpu().numpy().reshape(-1)

    episode_rewards = []
    for _ in range(num_episode):
        state = env.reset()
        episode_reward = 0.
        done = False
        while not done:
            # Only render when evaluating a single episode.
            if num_episode <= 1:
                env.render()
            action = exploit(state)
            next_state, reward, done, _ = env.step(action)
            episode_reward += reward
            state = next_state
        episode_rewards.append(episode_reward)

    print("Average reward of %s is %.1f" % (args.env_name, np.mean(episode_rewards)))
    print("Reward std of %s is %.1f" % (args.env_name, np.std(episode_rewards)))
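# `GaussianPolicy.load` lives in model.py; a plausible one-line sketch of it
# (an assumption for illustration, not the file's verbatim code):
def load(self, path):
    # Restore saved weights; map_location lets CPU-only machines load GPU checkpoints.
    self.load_state_dict(torch.load(path, map_location='cpu'))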
import argparse
import os

import gym
import torch

from model import GaussianPolicy


def run():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_id', type=str, default='HalfCheetah-v2')
    parser.add_argument('--log_name', type=str, default='sac-seed0-datetime')
    parser.add_argument('--cuda', action='store_true')
    parser.add_argument('--seed', type=int, default=0)
    args = parser.parse_args()

    log_dir = os.path.join('logs', args.env_id, args.log_name)
    env = gym.make(args.env_id)
    device = torch.device(
        "cuda" if args.cuda and torch.cuda.is_available() else "cpu")

    # Restore the trained policy and freeze it for evaluation.
    policy = GaussianPolicy(
        env.observation_space.shape[0],
        env.action_space.shape[0],
        hidden_units=[256, 256]).to(device)
    policy.load(os.path.join(log_dir, 'model', 'policy.pth'))
    grad_false(policy)

    def exploit(state):
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        with torch.no_grad():
            _, _, action = policy.sample(state)
        return action.cpu().numpy().reshape(-1)

    # Roll out one rendered episode with the deterministic policy.
    state = env.reset()
    episode_reward = 0.
    done = False
    while not done:
        env.render()
        action = exploit(state)
        next_state, reward, done, _ = env.step(action)
        episode_reward += reward
        state = next_state
    print(f'total reward: {episode_reward}')
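# All three scripts discard the first two return values of `policy.sample` and
# keep the third. A minimal sketch of the usual SAC sampling step this implies
# -- an assumption about model.py, not its verbatim code:
import torch
from torch.distributions import Normal

def sample(self, state):
    mean, log_std = self.forward(state)        # network heads
    normal = Normal(mean, log_std.exp())
    x = normal.rsample()                       # reparameterized sample
    action = torch.tanh(x)                     # squash into [-1, 1]
    # Change-of-variables correction to the log-prob for the tanh squashing.
    log_prob = normal.log_prob(x) - torch.log(1 - action.pow(2) + 1e-6)
    log_prob = log_prob.sum(dim=1, keepdim=True)
    # Third element: the greedy (mean) action used by `exploit` above.
    return action, log_prob, torch.tanh(mean)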
import random

import gym
import numpy as np
import torch
import visdom
from torch.optim import Adam
from torch.utils.tensorboard import SummaryWriter

from rltorch.memory import MultiStepMemory, PrioritizedMemory
from model import TwinnedQNetwork, GaussianPolicy
from dst_d import DeepSeaTreasure  # noqa: F401 -- importing presumably registers dst_d-v0 with gym

date = 'sac-seed0-20210512-2219'

# Restore the trained critic and policy for the DeepSeaTreasure run.
critic = TwinnedQNetwork(2, 2, 2, [256, 256])
critic.load('./logs/dst_d-v0/' + date + '/model/critic.pth')
policy = GaussianPolicy(4, 2, [256, 256])
policy.load('./logs/dst_d-v0/' + date + '/model/policy.pth')

device = 'cuda'
vis = visdom.Visdom()
env = gym.make('dst_d-v0')


def q_heatmap(action, prefer):
    prefer = torch.tensor(prefer, dtype=torch.float32)
    action = torch.tensor(action, dtype=torch.float32)
    value = np.empty([11, 11])
    time = np.empty([11, 11])
    for i in range(11):
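# Usage note (an assumption, not from the source): once the grid sweep fills
# `value`, visdom can render it directly with its existing heatmap API, e.g.
#   vis.heatmap(X=value, opts=dict(title='Q-value heatmap'))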