Example #1
# Imports needed to run this snippet; GaussianPolicy and grad_false are
# project-local helpers (a sketch of grad_false follows this example).
import argparse
import glob
import os
import time

import gym
import pybullet_envs  # noqa: F401  (registers the *BulletEnv-v0 environments)
import torch


def run():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_id', type=str, default='AntBulletEnv-v0')
    parser.add_argument('--log_name', type=str, default='')
    parser.add_argument('--cuda', action='store_true')
    parser.add_argument('--seed', type=int, default=0)
    args = parser.parse_args()

    if args.log_name:
        log_dir = os.path.join('logs', args.env_id, args.log_name)
    else:
        # Fall back to the most recently created run directory for this env.
        env_dir = os.path.join('logs', args.env_id, '*')
        dirs = glob.glob(env_dir)
        log_dir = max(dirs, key=os.path.getctime)
        print(f'using {log_dir}')

    env = gym.make(args.env_id)
    device = torch.device(
        "cuda" if args.cuda and torch.cuda.is_available() else "cpu")

    policy = GaussianPolicy(
        env.observation_space.shape[0],
        env.action_space.shape[0],
        hidden_units=[256, 256]).to(device)

    policy.load(os.path.join(log_dir, 'model', 'policy.pth'))
    grad_false(policy)

    # Greedy (deterministic) action selection for evaluation.
    def exploit(state):
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        with torch.no_grad():
            _, _, action = policy.sample(state)
        return action.cpu().numpy().reshape(-1)

    # For PyBullet environments the GUI has to be enabled by calling render()
    # once before reset().
    env.render()
    while True:
        state = env.reset()
        episode_reward = 0.
        done = False
        while not done:
            env.render()
            action = exploit(state)
            next_state, reward, done, _ = env.step(action)
            episode_reward += reward
            state = next_state
        print(f'total reward: {episode_reward}')
        time.sleep(1)
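The evaluation snippets above and below freeze the loaded policy with a project-local grad_false utility. A minimal sketch of what such a helper does, written as an assumption about the repository rather than a copy of it:

import torch.nn as nn


def grad_false(network: nn.Module) -> None:
    # Switch to inference mode and stop tracking gradients on every parameter,
    # since the policy is only used for action selection here.
    network.eval()
    for param in network.parameters():
        param.requires_grad = False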
Example #2
# Uses the same imports and project-local helpers as Example #1, plus numpy as np.
def testing():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_name', type=str, default='HalfCheetah-v2')
    parser.add_argument('--num_episode', type=int, default=10)
    args = parser.parse_args()

    num_episode = args.num_episode

    env = gym.make(args.env_name)
    device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu")

    policy = GaussianPolicy(
        env.observation_space.shape[0],
        env.action_space.shape[0],
        hidden_units=[256, 256]).to(device)

    policy.load(os.path.join('models', args.env_name, 'policy.pth'))
    grad_false(policy)

    def exploit(state):
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        with torch.no_grad():
            _, _, action = policy.sample(state)
        return action.cpu().numpy().reshape(-1)

    e_rewards = []
    for _ in range(num_episode):
        state = env.reset()
        episode_reward = 0.
        done = False
        while not done:
            # Render only when evaluating a single episode.
            if num_episode <= 1:
                env.render()
            action = exploit(state)
            next_state, reward, done, _ = env.step(action)
            episode_reward += reward
            state = next_state
        e_rewards.append(episode_reward)
    print("Average reward of %s is %.1f" % (args.env_name, np.mean(e_rewards)))
    print("Reward std of %s is %.1f" % (args.env_name, np.std(e_rewards)))
Example #3
# Uses the same imports and project-local helpers as Example #1.
def run():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_id', type=str, default='HalfCheetah-v2')
    parser.add_argument('--log_name', type=str, default='sac-seed0-datetime')
    parser.add_argument('--cuda', action='store_true')
    parser.add_argument('--seed', type=int, default=0)
    args = parser.parse_args()

    log_dir = os.path.join('logs', args.env_id, args.log_name)

    env = gym.make(args.env_id)
    device = torch.device(
        "cuda" if args.cuda and torch.cuda.is_available() else "cpu")

    policy = GaussianPolicy(env.observation_space.shape[0],
                            env.action_space.shape[0],
                            hidden_units=[256, 256]).to(device)

    policy.load(os.path.join(log_dir, 'model', 'policy.pth'))
    grad_false(policy)

    def exploit(state):
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        with torch.no_grad():
            _, _, action = policy.sample(state)
        return action.cpu().numpy().reshape(-1)

    state = env.reset()
    episode_reward = 0.
    done = False
    while not done:
        env.render()
        action = exploit(state)
        next_state, reward, done, _ = env.step(action)
        episode_reward += reward
        state = next_state
    print(f'total reward: {episode_reward}')
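All three evaluation snippets unpack three values from policy.sample(state) and keep the last one as the greedy action. A simplified sketch of such a sample method for a tanh-squashed Gaussian policy, assuming forward() returns the mean and log-std of the action distribution (an illustration, not the repository's exact code):

    def sample(self, states):
        # forward() outputs the mean and log-std of a diagonal Gaussian.
        means, log_stds = self.forward(states)
        normals = torch.distributions.Normal(means, log_stds.exp())
        # Reparameterized stochastic action, squashed into [-1, 1].
        xs = normals.rsample()
        actions = torch.tanh(xs)
        # Log-probability with the tanh change-of-variables correction.
        log_probs = (normals.log_prob(xs)
                     - torch.log(1 - actions.pow(2) + 1e-6)).sum(dim=1, keepdim=True)
        # The third value is the deterministic (mean) action used at test time.
        return actions, log_probs, torch.tanh(means)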
Example #4
    def __init__(self, env, log_dir, num_steps=3000000, batch_size=256,
                 lr=0.0003, hidden_units=[256, 256], memory_size=1e6,
                 gamma=0.99, tau=0.005, entropy_tuning=True, ent_coef=0.2,
                 multi_step=1, per=False, alpha=0.6, beta=0.4,
                 beta_annealing=0.0001, grad_clip=None, updates_per_step=1,
                 start_steps=10000, log_interval=10, target_update_interval=1,
                 eval_interval=1000, cuda=True, seed=0):
        self.env = env

        torch.manual_seed(seed)
        np.random.seed(seed)
        self.env.seed(seed)
        torch.backends.cudnn.deterministic = True  # Note: this can hurt performance.
        torch.backends.cudnn.benchmark = False

        self.device = torch.device(
            "cuda" if cuda and torch.cuda.is_available() else "cpu")

        self.policy = GaussianPolicy(
            self.env.observation_space.shape[0],
            self.env.action_space.shape[0],
            hidden_units=hidden_units).to(self.device)
        self.critic = TwinnedQNetwork(
            self.env.observation_space.shape[0],
            self.env.action_space.shape[0],
            hidden_units=hidden_units).to(self.device)
        self.critic_target = TwinnedQNetwork(
            self.env.observation_space.shape[0],
            self.env.action_space.shape[0],
            hidden_units=hidden_units).to(self.device).eval()

        # copy parameters of the learning network to the target network
        hard_update(self.critic_target, self.critic)
        # disable gradient calculations of the target network
        grad_false(self.critic_target)

        self.policy_optim = Adam(self.policy.parameters(), lr=lr)
        self.q1_optim = Adam(self.critic.Q1.parameters(), lr=lr)
        self.q2_optim = Adam(self.critic.Q2.parameters(), lr=lr)

        if entropy_tuning:
            # Target entropy is -|A|.
            self.target_entropy = -torch.prod(torch.Tensor(
                self.env.action_space.shape).to(self.device)).item()
            # We optimize log(alpha), instead of alpha.
            self.log_alpha = torch.zeros(
                1, requires_grad=True, device=self.device)
            self.alpha = self.log_alpha.exp()
            self.alpha_optim = Adam([self.log_alpha], lr=lr)
        else:
            # fixed alpha
            self.alpha = torch.tensor(ent_coef).to(self.device)

        if per:
            # replay memory with prioritized experience replay
            # See https://github.com/ku2482/rltorch/blob/master/rltorch/memory
            self.memory = PrioritizedMemory(
                memory_size, self.env.observation_space.shape,
                self.env.action_space.shape, self.device, gamma, multi_step,
                alpha=alpha, beta=beta, beta_annealing=beta_annealing)
        else:
            # replay memory without prioritized experience replay
            # See https://github.com/ku2482/rltorch/blob/master/rltorch/memory
            self.memory = MultiStepMemory(
                memory_size, self.env.observation_space.shape,
                self.env.action_space.shape, self.device, gamma, multi_step)

        self.log_dir = log_dir
        self.model_dir = os.path.join(log_dir, 'model')
        self.summary_dir = os.path.join(log_dir, 'summary')
        if not os.path.exists(self.model_dir):
            os.makedirs(self.model_dir)
        if not os.path.exists(self.summary_dir):
            os.makedirs(self.summary_dir)

        self.writer = SummaryWriter(log_dir=self.summary_dir)
        self.train_rewards = RunningMeanStats(log_interval)

        self.steps = 0
        self.learning_steps = 0
        self.episodes = 0
        self.num_steps = num_steps
        self.tau = tau
        self.per = per
        self.batch_size = batch_size
        self.start_steps = start_steps
        self.gamma_n = gamma ** multi_step
        self.entropy_tuning = entropy_tuning
        self.grad_clip = grad_clip
        self.updates_per_step = updates_per_step
        self.log_interval = log_interval
        self.target_update_interval = target_update_interval
        self.eval_interval = eval_interval
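The constructor relies on two further project-local utilities: hard_update, which copies the online critic into the target critic, and soft_update, which Example #5 below uses for the same purpose. A plausible sketch of both, assuming the usual Polyak-averaging convention:

import torch


def hard_update(target, source):
    # Copy every parameter of the source network into the target network.
    target.load_state_dict(source.state_dict())


def soft_update(target, source, tau):
    # Polyak averaging: target <- (1 - tau) * target + tau * source.
    with torch.no_grad():
        for t, s in zip(target.parameters(), source.parameters()):
            t.data.mul_(1.0 - tau)
            t.data.add_(tau * s.data)

With tau=1.0, soft_update reduces to a hard copy, which is exactly how Example #5 initializes its target critic.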
Example #5
    def __init__(self,
                 env,
                 log_dir,
                 num_steps=3000000,
                 initial_latent_steps=100000,
                 batch_size=256,
                 latent_batch_size=32,
                 num_sequences=8,
                 lr=0.0003,
                 latent_lr=0.0001,
                 feature_dim=256,
                 latent1_dim=32,
                 latent2_dim=256,
                 hidden_units=[256, 256],
                 memory_size=1e5,
                 gamma=0.99,
                 target_update_interval=1,
                 tau=0.005,
                 entropy_tuning=True,
                 ent_coef=0.2,
                 leaky_slope=0.2,
                 grad_clip=None,
                 updates_per_step=1,
                 start_steps=10000,
                 training_log_interval=10,
                 learning_log_interval=100,
                 eval_interval=50000,
                 cuda=True,
                 seed=0):

        self.env = env
        self.observation_shape = self.env.observation_space.shape
        self.action_shape = self.env.action_space.shape
        self.action_repeat = self.env.action_repeat

        torch.manual_seed(seed)
        np.random.seed(seed)
        self.env.seed(seed)
        # torch.backends.cudnn.deterministic = True  # Can hurt performance.
        # torch.backends.cudnn.benchmark = False  # Can hurt performance.

        self.device = torch.device(
            "cuda" if cuda and torch.cuda.is_available() else "cpu")

        self.latent = LatentNetwork(self.observation_shape, self.action_shape,
                                    feature_dim, latent1_dim, latent2_dim,
                                    hidden_units, leaky_slope).to(self.device)

        self.policy = GaussianPolicy(
            num_sequences * feature_dim +
            (num_sequences - 1) * self.action_shape[0], self.action_shape[0],
            hidden_units).to(self.device)

        self.critic = TwinnedQNetwork(latent1_dim + latent2_dim,
                                      self.action_shape[0],
                                      hidden_units).to(self.device)
        self.critic_target = TwinnedQNetwork(
            latent1_dim + latent2_dim, self.action_shape[0],
            hidden_units).to(self.device).eval()

        # Copy parameters of the learning network to the target network.
        soft_update(self.critic_target, self.critic, 1.0)
        # Disable gradient calculations of the target network.
        grad_false(self.critic_target)

        # Policy is updated without the encoder.
        self.policy_optim = Adam(self.policy.parameters(), lr=lr)
        self.q_optim = Adam(self.critic.parameters(), lr=lr)
        self.latent_optim = Adam(self.latent.parameters(), lr=latent_lr)

        if entropy_tuning:
            # Target entropy is -|A|.
            self.target_entropy = -self.action_shape[0]
            # We optimize log(alpha) because alpha is always larger than 0.
            self.log_alpha = torch.zeros(1,
                                         requires_grad=True,
                                         device=self.device)
            self.alpha_optim = Adam([self.log_alpha], lr=lr)
            self.alpha = self.log_alpha.detach().exp()

        else:
            self.alpha = ent_coef

        self.memory = LazyMemory(memory_size, num_sequences,
                                 self.observation_shape, self.action_shape,
                                 self.device)

        self.log_dir = log_dir
        self.model_dir = os.path.join(log_dir, 'model')
        self.summary_dir = os.path.join(log_dir, 'summary')
        if not os.path.exists(self.model_dir):
            os.makedirs(self.model_dir)
        if not os.path.exists(self.summary_dir):
            os.makedirs(self.summary_dir)

        self.writer = SummaryWriter(log_dir=self.summary_dir)
        self.train_rewards = RunningMeanStats(training_log_interval)

        self.steps = 0
        self.learning_steps = 0
        self.episodes = 0
        self.initial_latent_steps = initial_latent_steps
        self.num_sequences = num_sequences
        self.num_steps = num_steps
        self.tau = tau
        self.batch_size = batch_size
        self.latent_batch_size = latent_batch_size
        self.start_steps = start_steps
        self.gamma = gamma
        self.entropy_tuning = entropy_tuning
        self.grad_clip = grad_clip
        self.updates_per_step = updates_per_step
        self.training_log_interval = training_log_interval
        self.learning_log_interval = learning_log_interval
        self.target_update_interval = target_update_interval
        self.eval_interval = eval_interval
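When entropy_tuning is enabled, both agents keep a learnable log_alpha and a target entropy of -|A|. A minimal sketch of the standard SAC temperature update that this setup implies, written against the attributes defined above (the repository's actual method may differ):

    def calc_entropy_loss(self, log_probs):
        # A gradient step on log_alpha pushes the policy entropy towards the target:
        # loss = -E[log_alpha * (log_pi + target_entropy)], with log_pi detached.
        return -(self.log_alpha * (log_probs + self.target_entropy).detach()).mean()

    def update_alpha(self, log_probs):
        entropy_loss = self.calc_entropy_loss(log_probs)
        self.alpha_optim.zero_grad()
        entropy_loss.backward()
        self.alpha_optim.step()
        # Refresh the cached temperature after the optimizer step.
        self.alpha = self.log_alpha.detach().exp()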