Example #1
import argparse
import glob
import os
import time

import gym
import pybullet_envs  # registers the PyBullet envs such as AntBulletEnv-v0
import torch

# GaussianPolicy and grad_false come from the project's own modules; the
# import paths below are assumptions (Example #4 imports GaussianPolicy from `model`).
from model import GaussianPolicy
from utils import grad_false


def run():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_id', type=str, default='AntBulletEnv-v0')
    parser.add_argument('--log_name', type=str, default='')
    parser.add_argument('--cuda', action='store_true')
    parser.add_argument('--seed', type=int, default=0)
    args = parser.parse_args()

    if args.log_name:
        log_dir = os.path.join('logs', args.env_id, args.log_name)
    else:
        env_dir = os.path.join('logs', args.env_id, '*')
        dirs = glob.glob(env_dir)
        log_dir = max(dirs, key=os.path.getctime)
        print(f'using {log_dir}')

    env = gym.make(args.env_id)
    device = torch.device(
        "cuda" if args.cuda and torch.cuda.is_available() else "cpu")

    policy = GaussianPolicy(
        env.observation_space.shape[0],
        env.action_space.shape[0],
        hidden_units=[256, 256]).to(device)

    policy.load(os.path.join(log_dir, 'model', 'policy.pth'))
    grad_false(policy)

    def exploit(state):
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        with torch.no_grad():
            _, _, action = policy.sample(state)
        return action.cpu().numpy().reshape(-1)

    env.render()
    while True:
        state = env.reset()
        episode_reward = 0.
        done = False
        while not done:
            env.render()
            action = exploit(state)
            next_state, reward, done, _ = env.step(action)
            episode_reward += reward
            state = next_state
        print(f'total reward: {episode_reward}')
        time.sleep(1)
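Each of these evaluation scripts calls grad_false(policy) after loading the checkpoint. Its implementation is not shown in these snippets; a minimal sketch of what such a helper typically does, assuming it only freezes parameters and switches the network to inference mode, is:

def grad_false(network):
    # Disable gradient tracking on every parameter so the loaded policy is
    # evaluation-only, then switch to eval mode (affects dropout/batch norm).
    for param in network.parameters():
        param.requires_grad = False
    network.eval()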
Example #2
import argparse
import os

import gym
import numpy as np
import torch

# Project-level helpers; these import paths are assumptions (see Example #4).
from model import GaussianPolicy
from utils import grad_false


def testing():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_name', type=str, default='HalfCheetah-v2')
    parser.add_argument('--num_episode', type=int, default=10)
    args = parser.parse_args()

    num_episode = args.num_episode

    env = gym.make(args.env_name)
    device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu")

    policy = GaussianPolicy(
        env.observation_space.shape[0],
        env.action_space.shape[0],
        hidden_units=[256, 256]).to(device)

    policy.load(os.path.join('models', args.env_name, 'policy.pth'))
    grad_false(policy)

    def exploit(state):
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        with torch.no_grad():
            _, _, action = policy.sample(state)
        return action.cpu().numpy().reshape(-1)

    e_rewards = []
    for _ in range(num_episode):
        state = env.reset()
        episode_reward = 0.
        done = False
        while not done:
            if num_episode <= 1:
                env.render()
            action = exploit(state)
            next_state, reward, done, _ = env.step(action)
            episode_reward += reward
            state = next_state
        e_rewards.append(episode_reward)
    print(f"Average reward of {args.env_name} is {np.mean(e_rewards):.1f}")
    print(f"Reward std of {args.env_name} is {np.std(e_rewards):.1f}")
def run():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_id', type=str, default='HalfCheetah-v2')
    parser.add_argument('--log_name', type=str, default='sac-seed0-datetime')
    parser.add_argument('--cuda', action='store_true')
    parser.add_argument('--seed', type=int, default=0)
    args = parser.parse_args()

    log_dir = os.path.join('logs', args.env_id, args.log_name)

    env = gym.make(args.env_id)
    device = torch.device(
        "cuda" if args.cuda and torch.cuda.is_available() else "cpu")

    policy = GaussianPolicy(env.observation_space.shape[0],
                            env.action_space.shape[0],
                            hidden_units=[256, 256]).to(device)

    policy.load(os.path.join(log_dir, 'model', 'policy.pth'))
    grad_false(policy)

    def exploit(state):
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        with torch.no_grad():
            _, _, action = policy.sample(state)
        return action.cpu().numpy().reshape(-1)

    state = env.reset()
    episode_reward = 0.
    done = False
    while not done:
        env.render()
        action = exploit(state)
        next_state, reward, done, _ = env.step(action)
        episode_reward += reward
        state = next_state
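The exploit() helpers unpack `_, _, action = policy.sample(state)` and treat the third return value as the deterministic action. The actual GaussianPolicy lives in the project's model module (and also provides the load() method used above); a minimal sketch that is consistent with this call pattern, where the layer layout and clamping constants are assumptions rather than the repository's code, is:

import torch
import torch.nn as nn
from torch.distributions import Normal


class GaussianPolicy(nn.Module):
    def __init__(self, num_inputs, num_actions, hidden_units=(256, 256)):
        super().__init__()
        h1, h2 = hidden_units
        self.net = nn.Sequential(
            nn.Linear(num_inputs, h1), nn.ReLU(),
            nn.Linear(h1, h2), nn.ReLU())
        self.mean = nn.Linear(h2, num_actions)
        self.log_std = nn.Linear(h2, num_actions)

    def sample(self, state):
        feat = self.net(state)
        mean = self.mean(feat)
        log_std = self.log_std(feat).clamp(-20, 2)
        normal = Normal(mean, log_std.exp())
        x = normal.rsample()                    # reparameterized sample
        action = torch.tanh(x)                  # squashed stochastic action
        # Tanh-corrected log-probability (needed for training, ignored by exploit).
        log_prob = (normal.log_prob(x)
                    - torch.log(1 - action.pow(2) + 1e-6)).sum(-1, keepdim=True)
        # The third element is the deterministic (greedy) action used by exploit().
        return action, log_prob, torch.tanh(mean)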
Example #4
import random

import gym
import numpy as np   # used by q_heatmap below
import torch
import visdom        # used for the Visdom client created below
from torch.optim import Adam
from torch.utils.tensorboard import SummaryWriter
from rltorch.memory import MultiStepMemory, PrioritizedMemory

from model import TwinnedQNetwork, GaussianPolicy
from dst_d import DeepSeaTreasure  # importing dst_d presumably registers the dst_d-v0 env used below

date = 'sac-seed0-20210512-2219'
critic = TwinnedQNetwork(2, 2, 2, [256, 256])
critic.load('./logs/dst_d-v0/' + date + '/model/critic.pth')

policy = GaussianPolicy(4, 2, [256, 256])
policy.load('./logs/dst_d-v0/' + date + '/model/policy.pth')

device = 'cuda'

vis = visdom.Visdom()
env = gym.make('dst_d-v0')


def q_heatmap(action, prefer):
    prefer = torch.tensor(prefer, dtype=torch.float32)
    action = torch.tensor(action, dtype=torch.float32)

    value = np.empty([11, 11])
    time = np.empty([11, 11])

    for i in range(11):