Exemplo n.º 1
0
def trainD(file_name="Distral_1col", list_of_envs=[GridworldEnv(4),
            GridworldEnv(5)], batch_size=128, gamma=0.999, alpha=0.9,
            beta=5, eps_start=0.9, eps_end=0.05, eps_decay=5,
            is_plot=False, num_episodes=200,
            max_num_steps_per_episode=1000, learning_rate=0.001,
            memory_replay_size=10000, memory_policy_size=1000):
    """
    Soft Q-learning training routine. Retuns rewards and durations logs.
    Plot environment screen
    """
    # action dimension
    num_actions = list_of_envs[0].action_space.n
    # total envs
    num_envs = len(list_of_envs)
    # pi_0
    policy = PolicyNetwork(num_actions)
    # Q value, every environment has one, used to calculate A_i,
    models = [DQN(num_actions) for _ in range(0, num_envs)]   ### Add torch.nn.ModuleList (?)
    # replay buffer for env ???
    memories = [ReplayMemory(memory_replay_size, memory_policy_size) for _ in range(0, num_envs)]

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    # device = "cpu"
    print(device)
    # model
    policy = policy.to(device)
    for i in range(len(models)):
        models[i] = models[i].to(device)

    # optimizer for every Q model
    optimizers = [optim.Adam(model.parameters(), lr=learning_rate)
                    for model in models]
    # optimizer for policy
    policy_optimizer = optim.Adam(policy.parameters(), lr=learning_rate)
    # optimizer = optim.RMSprop(model.parameters(), )

    # info list for each environment
    episode_durations = [[] for _ in range(num_envs)]   # list of local steps
    episode_rewards = [[] for _ in range(num_envs)]     # list of list of episode reward

    episodes_done = np.zeros(num_envs)      # episode num
    steps_done = np.zeros(num_envs)         # global timesteps for each env
    current_time = np.zeros(num_envs)       # local timesteps for each env

    # Initialize environments
    for env in list_of_envs:
        env.reset()

    while np.min(episodes_done) < num_episodes:
        policy.train()
        for model in models:
            model.train()

        # TODO: add max_num_steps_per_episode

        # Optimization is given by alterating minimization scheme:
        #   1. do the step for each env
        #   2. do one optimization step for each env using "soft-q-learning".
        #   3. do one optimization step for the policy

        #   1. do the step for each env
        for i_env, env in enumerate(list_of_envs):
            # print("Cur episode:", i_episode, "steps done:", steps_done,
            #         "exploration factor:", eps_end + (eps_start - eps_end) * \
            #         math.exp(-1. * steps_done / eps_decay))
        
            # last_screen = env.current_grid_map
            # ===========update step info begin========================
            current_screen = get_screen(env)
            # state
            state = current_screen # - last_screen
            # action chosen by pi_1~pi_i
            action = select_action(state, policy, models[i_env], num_actions,
                                    eps_start, eps_end, eps_decay,
                                    episodes_done[i_env], alpha, beta, device)
            # global_steps
            steps_done[i_env] += 1
            # local steps
            current_time[i_env] += 1
            # reward
            _, reward, done, _ = env.step(action[0, 0])
            reward = Tensor([reward])

            # next state
            last_screen = current_screen
            current_screen = get_screen(env)
            if not done:
                next_state = current_screen # - last_screen
            else:
                next_state = None

            # add to buffer
            time = Tensor([current_time[i_env]])
            memories[i_env].push(state, action, next_state, reward, time)

            #   2. do one optimization step for each env using "soft-q-learning".
            # Perform one step of the optimization (on the target network)
            optimize_model(policy, models[i_env], optimizers[i_env],
                            memories[i_env], batch_size, alpha, beta, gamma, device)
            # ===========update step info end ========================


            # ===========update episode info begin ====================
            if done:
                print("ENV:", i_env, "iter:", episodes_done[i_env],
                    "\treward:", env.episode_total_reward,
                    "\tit:", current_time[i_env], "\texp_factor:", eps_end +
                    (eps_start - eps_end) * math.exp(-1. * episodes_done[i_env] / eps_decay))
                # reset env
                env.reset()
                # episode steps
                episodes_done[i_env] += 1
                # append each episode local timesteps list for every env
                episode_durations[i_env].append(current_time[i_env])
                # reset local timesteps
                current_time[i_env] = 0
                # append total episode_reward to list
                episode_rewards[i_env].append(env.episode_total_reward)
                if is_plot:
                    plot_rewards(episode_rewards, i_env)
            # ===========update episode info end ====================

        #   3. do one optimization step for the policy
        # after all envs has performed one step, optimize policy
        optimize_policy(policy, policy_optimizer, memories, batch_size,
                    num_envs, gamma, device)

    print('Complete')
    env.render(close=True)
    env.close()
    if is_plot:
        plt.ioff()
        plt.show()

    ## Store Results

    np.save(file_name + '-distral-2col-rewards', episode_rewards)
    np.save(file_name + '-distral-2col-durations', episode_durations)

    return models, policy, episode_rewards, episode_durations
Exemplo n.º 2
0
class SAC_Agent:
    def __init__(self, load_from=None, will_train=True):
        self.env = TorcsEnv(
            path='/usr/local/share/games/torcs/config/raceman/quickrace.xml')
        self.args = SAC_args()
        self.buffer = ReplayBuffer(self.args.buffer_size)

        action_dim = self.env.action_space.shape[0]
        state_dim = self.env.observation_space.shape[0]
        hidden_dim = 256

        self.action_size = action_dim
        self.state_size = state_dim

        self.value_net = ValueNetwork(state_dim,
                                      hidden_dim).to(self.args.device)
        self.target_value_net = ValueNetwork(state_dim,
                                             hidden_dim).to(self.args.device)

        self.soft_q_net1 = SoftQNetwork(state_dim, action_dim,
                                        hidden_dim).to(self.args.device)
        self.soft_q_net2 = SoftQNetwork(state_dim, action_dim,
                                        hidden_dim).to(self.args.device)

        self.policy_net = PolicyNetwork(state_dim, action_dim,
                                        hidden_dim).to(self.args.device)

        self.target_value_net.load_state_dict(self.value_net.state_dict())

        self.value_criterion = nn.MSELoss()
        self.soft_q_loss1 = nn.MSELoss()
        self.soft_q_loss2 = nn.MSELoss()

        self.value_opt = optim.Adam(self.value_net.parameters(),
                                    lr=self.args.lr)
        self.soft_q_opt1 = optim.Adam(self.soft_q_net1.parameters(),
                                      lr=self.args.lr)
        self.soft_q_opt2 = optim.Adam(self.soft_q_net2.parameters(),
                                      lr=self.args.lr)
        self.policy_opt = optim.Adam(self.policy_net.parameters(),
                                     lr=self.args.lr)

        if will_train:
            current_time = time.strftime('%d-%b-%y-%H.%M.%S', time.localtime())
            self.plot_folder = f'plots/{current_time}'
            self.model_save_folder = f'model/{current_time}'
            make_sure_dir_exists(self.plot_folder)
            make_sure_dir_exists(self.model_save_folder)
            self.cp = Checkpoint(self.model_save_folder)

        if load_from is not None:
            try:
                self.load_checkpoint(load_from)
            except FileNotFoundError:
                print(f'{load_from} not found. Running default.')
        else:
            print('Starting from scratch.')

    def train(self):
        remove_log_file()
        clear_action_logs()
        eps_n = 0
        rewards = []
        test_rewards = []
        best_reward = -np.inf
        info = None
        for eps_n in range(1, self.args.max_eps + 1):  # Train loop
            self.set_mode('train')
            relaunch = (eps_n - 1) % (20 / self.args.test_rate) == 0
            state = self.env.reset(relaunch=relaunch,
                                   render=False,
                                   sampletrack=False)
            eps_r = 0
            sigma = (self.args.start_sigma - self.args.end_sigma) * (max(
                0, 1 - (eps_n - 1) / self.args.max_eps)) + self.args.end_sigma
            randomprocess = OrnsteinUhlenbeckProcess(self.args.theta, sigma,
                                                     self.action_size)

            for step in range(self.args.max_eps_time):  # Episode
                action = self.policy_net.get_train_action(state, randomprocess)
                next_state, reward, done, info = self.env.step(action)

                self.buffer.push(state, action, reward, next_state, done)

                state = next_state
                eps_r += reward

                if len(self.buffer) > self.args.batch_size:
                    self.update()

                if done:
                    break

            rewards.append(eps_r)

            test_reward = self.test(eps_n)
            test_rewards.append(test_reward)

            if test_reward > best_reward:
                best_reward = test_reward
                self.save_checkpoint(eps_n, best_reward)

            info_str = ', '.join(
                [key for key in info.keys() if key != 'place'])
            info_str += f", {info['place']}. place"
            log(f'Episode {eps_n:<4} Reward: {eps_r:>7.2f} Test Reward: {test_reward:>7.2f} Info: {info_str}'
                )

            if eps_n % self.args.plot_per == 0:
                self.plot(rewards, test_rewards, eps_n)

    def update(self):
        state, action, reward, next_state, done = self.buffer.sample(
            self.args.batch_size)

        state = FloatTensor(state).to(self.args.device)
        next_state = FloatTensor(next_state).to(self.args.device)
        action = FloatTensor(action).to(self.args.device)
        reward = FloatTensor(reward).unsqueeze(1).to(self.args.device)
        done = FloatTensor(np.float32(done)).unsqueeze(1).to(self.args.device)

        predicted_q_value1 = self.soft_q_net1(state, action)
        predicted_q_value2 = self.soft_q_net2(state, action)
        predicted_value = self.value_net(state)
        new_action, log_prob, epsilon, mean, log_std = self.policy_net.evaluate(
            state)

        # Training Q function
        target_value = self.target_value_net(next_state)
        target_q_value = reward + (1 - done) * self.args.gamma * target_value
        q_value_loss1 = self.soft_q_loss1(predicted_q_value1,
                                          target_q_value.detach())
        q_value_loss2 = self.soft_q_loss2(predicted_q_value2,
                                          target_q_value.detach())

        self.soft_q_opt1.zero_grad()
        q_value_loss1.backward()
        if self.args.clipgrad:
            self.clip_grad(self.soft_q_net1.parameters())
        self.soft_q_opt1.step()
        self.soft_q_opt2.zero_grad()
        q_value_loss2.backward()
        if self.args.clipgrad:
            self.clip_grad(self.soft_q_net2.parameters())
        self.soft_q_opt2.step()

        # Training Value function
        predicted_new_q_value = torch.min(self.soft_q_net1(state, new_action),
                                          self.soft_q_net2(state, new_action))
        target_value_func = predicted_new_q_value - self.args.alpha * log_prob.sum(
        )
        value_loss = self.value_criterion(predicted_value,
                                          target_value_func.detach())

        self.value_opt.zero_grad()
        value_loss.backward()
        if self.args.clipgrad:
            self.clip_grad(self.value_net.parameters())
        self.value_opt.step()

        # Training Policy function
        policy_loss = (log_prob - predicted_new_q_value).mean()

        self.policy_opt.zero_grad()
        policy_loss.backward()
        if self.args.clipgrad:
            self.clip_grad(self.policy_net.parameters())
        self.policy_opt.step()

        # Updating target value network
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(target_param.data *
                                    (1.0 - self.args.soft_tau) +
                                    param.data * self.args.soft_tau)

    def test(self, eps_n):
        self.set_mode('eval')
        rewards = []
        for step in range(self.args.test_rate):
            render = (eps_n % 30 == 0) and (step == 0)
            relaunch = render or ((eps_n % 30 == 0) and (step == 1))
            state = self.env.reset(relaunch=relaunch,
                                   render=render,
                                   sampletrack=False)
            running_reward = 0
            for t in range(self.args.max_eps_time):
                action = self.policy_net.get_test_action(state)
                state, reward, done, info = self.env.step(action)
                store(action, eps_n, reward, info, t == 0)
                running_reward += reward
                if done:
                    break
            rewards.append(running_reward)
        avg_reward = sum(rewards) / self.args.test_rate
        return avg_reward

    def plot(self, rewards, test_rewards, eps_n):
        torch.save({
            'train_rewards': rewards,
            'test_rewards': test_rewards
        }, f'{self.plot_folder}/{eps_n}.pth')
        figure = plt.figure()
        plt.plot(rewards, label='Train Rewards')
        plt.plot(test_rewards, label='Test Rewards')
        plt.xlabel('Episode')
        plt.legend()
        plt.savefig(f'{self.plot_folder}/{eps_n}.png')
        try:
            send_mail(f'Improved Torcs SAC | Episode {eps_n}',
                      f'{self.plot_folder}/{eps_n}.png')
            log('Mail has been sent.')
        except (KeyboardInterrupt, SystemExit):
            print('KeyboardInterrupt or SystemExit')
            raise
        except Exception as e:
            print('Mail Exception occured:', e)
            emsg = e.args[-1]
            emsg = emsg[:1].lower() + emsg[1:]
            log('Couldn\'t send mail because', emsg)

    def clip_grad(self, parameters):
        for param in parameters:
            param.grad.data.clamp_(-1, 1)

    def set_mode(self, mode):
        if mode == 'train':
            self.value_net.train()
            self.target_value_net.train()
            self.soft_q_net1.train()
            self.soft_q_net2.train()
            self.policy_net.train()
        elif mode == 'eval':
            self.value_net.eval()
            self.target_value_net.eval()
            self.soft_q_net1.eval()
            self.soft_q_net2.eval()
            self.policy_net.eval()
        else:
            raise ValueError('mode should be either train or eval')

    def save_checkpoint(self, eps_n, test_reward):
        self.cp.update(self.value_net, self.soft_q_net1, self.soft_q_net2,
                       self.policy_net)
        self.cp.save(f'e{eps_n}-r{test_reward:.4f}.pth')
        log(f'Saved checkpoint at episode {eps_n}.')

    def load_checkpoint(self, load_from):
        state_dicts = torch.load(load_from)
        self.value_net.load_state_dict(state_dicts['best_value'])
        self.soft_q_net1.load_state_dict(state_dicts['best_q1'])
        self.soft_q_net2.load_state_dict(state_dicts['best_q2'])
        self.policy_net.load_state_dict(state_dicts['best_policy'])
        print(f'Loaded from {load_from}.')

    def race(self, sampletrack=True):
        with torch.no_grad():
            state = self.env.reset(relaunch=True,
                                   render=True,
                                   sampletrack=sampletrack)
            running_reward = 0
            done = False
            while not done:
                action = self.policy_net.get_test_action(state)
                state, reward, done, info = self.env.step(action)
                running_reward += reward

            print('Reward:', running_reward)