Example #1
class RLAgent(Player):
    def __init__(self,
                 name,
                 others=None,
                 last_n=10,
                 load_path=None,
                 checkpoint=5000,
                 fixed_strategy=False,
                 eps_decay=0.00005):
        if others is None:
            others = [1, 2]
        self.others = others
        self.last_n = last_n
        self.prev_points = 0
        self.batch_size = 32
        self.gamma = 0.9
        self.eps_start = 1
        self.eps_end = 0.01
        self.eps_decay = eps_decay
        self.target_update = 100
        self.plot_at = 1000
        self.q_max = []
        self.q_list = []
        self.checkpoint = checkpoint
        self.memory_size = 1000
        self.lr = 0.00001
        self.train = True

        self.input_dim = len(others) * 6
        self.output_dim = 3
        self.current_step = 1
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.memory = ReplayMemory(self.memory_size)

        # Initialize the policy and target networks
        self.policy_net = DQN(self.input_dim, self.output_dim).to(self.device)
        self.target_net = DQN(self.input_dim, self.output_dim).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        if load_path is not None:
            # Load a saved policy and switch to pure exploitation (no training).
            saved = torch.load(load_path)
            self.policy_net.load_state_dict(saved['model_state_dict'])
            self.policy_net.eval()
            self.eps_start = 0
            self.eps_end = 0
            self.train = False
        if fixed_strategy:
            self.strategy = FixedStrategy()
        else:
            self.strategy = EpsilonGreedyStrategy(self.eps_start, self.eps_end,
                                                  self.eps_decay)

        # Set the optimizer
        self.optimizer = optim.Adam(params=self.policy_net.parameters(),
                                    lr=self.lr)
        self.loss = None

        # Push to replay memory
        self.prev_state = None
        self.action = None
        self.reward = None
        self.current_state = None

        super().__init__(name)

    def select_action(self, valid_actions, history):
        # print(self.memory.can_provide_sample(self.batch_size))
        if self.memory.can_provide_sample(self.batch_size) and self.train:
            self.train_model()

        if len(history) > self.last_n + 1:
            self.prev_state, self.current_state = self.get_states(history)
            self.reward = self.get_reward()
            if self.action is not None and self.train:
                self.memory.push(
                    Experience(self.prev_state, self.action,
                               self.current_state, self.reward))
            self.action = self.get_action(valid_actions)
            return self.action.item()
        else:
            return np.random.choice(valid_actions)

    def get_states(self, history):
        prev_state, current_state = [], []
        if len(history) > self.last_n + 1:
            for other in self.others:
                other_history = [i[other] for i in history]
                other_last_n = other_history[-self.last_n:]
                other_last_n_p = other_history[-self.last_n - 1:-1]
                other_policy_total = get_policy(other_history)
                other_policy_last_n = get_policy(other_last_n)
                other_policy_total_p = get_policy(other_history[:-1])
                other_policy_last_n_p = get_policy(other_last_n_p)
                prev_state.extend(other_policy_total_p + other_policy_last_n_p)
                current_state.extend(other_policy_total + other_policy_last_n)
        return torch.as_tensor(prev_state).unsqueeze(-2), torch.as_tensor(
            current_state).unsqueeze(-2)

    def get_reward(self):
        reward = self.points - self.prev_points
        self.prev_points = self.points
        return torch.tensor([reward])

    def get_action(self, valid_actions):
        rate = self.strategy.get_exploration_rate(self.current_step)
        self.current_step += 1
        if rate > random.random():
            # For random, we can pass the allowable_moves vector and choose from it randomly
            action = np.random.choice(valid_actions)
            return torch.tensor([action]).to(self.device)  # explore
        else:
            with torch.no_grad():
                q_values = self.policy_net(self.current_state)
                self.q_max.append(q_values.max().item())
                return q_values.max(1)[1].to(self.device)  # exploit

    def train_model(self):
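        # Note: q_max_list, checkpoint_folder, checkpoint_prefix and
        # checkpoint_suffix are referenced below and are assumed to be
        # module-level globals defined alongside this class.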
        experiences = self.memory.sample(self.batch_size)
        states, actions, rewards, next_states = extract_tensors(experiences)
        if self.current_step % self.target_update == 0:
            print('UPDATE TARGET NET', self.current_step)
            self.q_list.extend(self.q_max)
            print('Q Max', sum(self.q_max) / self.target_update)
            q_max_list.append(sum(self.q_max) / self.target_update)
            self.q_max = []
            self.target_net.load_state_dict(self.policy_net.state_dict())

        if self.current_step % self.plot_at == 0:
            e_ = self.memory.memory[-100:]
            batch = Experience(*zip(*e_))
            print('\n', '*' * 42)
            print('EXPLORATION RATE',
                  self.strategy.get_exploration_rate(self.current_step))
            print('REWARD', sum(batch.reward).item())
            print('POLICY', get_policy([i.item() for i in batch.action]))
            print('*' * 42, '\n')
            plt.plot(range(len(q_max_list)), q_max_list)
            plt.show()
        if self.current_step % self.checkpoint == 0:
            print('SAVE CHECKPOINT AT', self.current_step)
            checkpoint_path = checkpoint_folder + checkpoint_prefix + str(
                self.current_step) + checkpoint_suffix
            torch.save({'model_state_dict': self.policy_net.state_dict()},
                       checkpoint_path)
        current_q_values = QValues.get_current(self.policy_net, states,
                                               actions)
        next_q_values = QValues.get_next(self.policy_net, self.target_net,
                                         next_states)
        target_q_values = (next_q_values * self.gamma) + rewards
        self.loss = F.mse_loss(current_q_values, target_q_values)
        self.optimizer.zero_grad()
        self.loss.backward()
        self.optimizer.step()
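
The agent above depends on helpers that are not shown (`Experience`, `ReplayMemory`, `extract_tensors`, `EpsilonGreedyStrategy`; `DQN`, `FixedStrategy`, `QValues` and `get_policy` are omitted here as well). A minimal sketch of what they could look like, inferred only from how they are called in this example; the real project may define them differently:

import math
import random
from collections import namedtuple

import torch

# Hypothetical definitions matching the calls made by RLAgent.
Experience = namedtuple('Experience',
                        ('state', 'action', 'next_state', 'reward'))


class ReplayMemory:
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, experience):
        # Ring buffer: overwrite the oldest experience once full.
        if len(self.memory) < self.capacity:
            self.memory.append(experience)
        else:
            self.memory[self.position] = experience
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def can_provide_sample(self, batch_size):
        return len(self.memory) >= batch_size


class EpsilonGreedyStrategy:
    def __init__(self, start, end, decay):
        self.start, self.end, self.decay = start, end, decay

    def get_exploration_rate(self, current_step):
        # Exponentially anneal epsilon from start towards end.
        return self.end + (self.start - self.end) * math.exp(
            -current_step * self.decay)


def extract_tensors(experiences):
    # Stack a list of Experience tuples into batched tensors.
    batch = Experience(*zip(*experiences))
    states = torch.cat(batch.state)
    actions = torch.cat(batch.action)
    rewards = torch.cat(batch.reward)
    next_states = torch.cat(batch.next_state)
    return states, actions, rewards, next_states
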
Example #2
class Agent:
    def __init__(self,
                 env,
                 input_size,
                 output_size,
                 hidden_size,
                 max_cars=10,
                 max_passengers=10,
                 mix_hidden=32,
                 batch_size=128,
                 lr=0.001,
                 gamma=.999,
                 eps_start=0.9,
                 eps_end=0.05,
                 eps_decay=750,
                 replay_capacity=10000,
                 num_save=200,
                 num_episodes=10000,
                 mode="random",
                 training=False,
                 load_file=None):
        self.env = env
        self.orig_env = copy.deepcopy(env)
        self.grid_map = env.grid_map
        self.cars = env.grid_map.cars
        self.num_cars = len(self.cars)
        self.passengers = env.grid_map.passengers
        self.num_passengers = len(self.passengers)
        self.max_cars = max_cars
        self.max_passengers = max_passengers
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.eps_start = eps_start
        self.eps_end = eps_end
        self.eps_decay = eps_decay
        self.replay_capacity = replay_capacity
        self.num_episodes = num_episodes
        self.steps_done = 0
        self.lr = lr
        self.mode = mode
        self.num_save = num_save
        self.training = training
        self.algorithm = PairAlgorithm()
        self.episode_durations = []
        self.duration_matrix = np.zeros((self.max_passengers, self.max_cars))
        self.count_matrix = np.zeros((self.max_passengers, self.max_cars))
        self.loss_history = []

        self.memory = ReplayMemory(self.replay_capacity)

        self.device = torch.device("cpu")
        print("Device being used:", self.device)
        self.policy_net = DQN(self.input_size, self.output_size,
                              self.hidden_size).to(self.device)

        self.params = list(self.policy_net.parameters())

        if self.mode == "qmix":
            self.mixer = QMixer(self.input_size, self.max_passengers,
                                mix_hidden).to(self.device)
            self.params += list(self.mixer.parameters())

        if load_file:
            self.policy_net.load_state_dict(torch.load(load_file))
            if self.mode == "qmix":
                self.mixer.load_state_dict(torch.load("mixer_" + load_file))
                self.mixer.eval()
            self.policy_net.eval()
            self.load_file = "Pretrained_" + load_file
            print("Checkpoint loaded")
        else:
            self.load_file = (
                f"{self.mode}_model_num_cars_{self.num_cars}"
                f"_num_passengers_{self.num_passengers}"
                f"_num_episodes_{self.num_episodes}"
                f"_hidden_size_{self.hidden_size}.pth")

        self.optimizer = optim.RMSprop(self.params, lr=self.lr)
        #self.optimizer = optim.Adam(self.params, lr=self.lr, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)
        #self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, 1500, gamma=0.1)

    def select_action(self, state):
        #Select action with epsilon greedy
        sample = random.random()

        eps_threshold = self.eps_end + (self.eps_start - self.eps_end) * \
            math.exp(-1. * self.steps_done / self.eps_decay)

        print(eps_threshold)

        self.steps_done += 1

        if not self.training:
            eps_threshold = 0.0

        if sample > eps_threshold:
            # Choose best action
            with torch.no_grad():

                self.policy_net.eval()
                action = self.policy_net(state).view(
                    self.max_passengers,
                    self.max_cars)[:, :self.num_cars].max(1)[1].view(
                        1, self.max_passengers)
                action[0, self.num_passengers:] = self.max_cars
                return action

        else:
            #Choose random action
            # One random car index per passenger slot.
            action = torch.tensor([[
                random.randrange(self.num_cars)
                for _ in range(self.max_passengers)
            ]],
                                  device=self.device,
                                  dtype=torch.long)
            action[0, self.num_passengers:] = self.max_cars
            return action

    def random_action(self, state):
        # One random car index per passenger currently in the map.
        return torch.tensor([[
            random.randrange(self.num_cars)
            for _ in range(self.num_passengers)
        ]],
                            device=self.device,
                            dtype=torch.long)

    def get_state(self):
        # State layout: car (x, y) positions, a per-car presence indicator,
        # passenger (pickup_x, pickup_y, dropoff_x, dropoff_y), and a
        # per-passenger presence indicator. Vector size = 3*C + 5*P.
        cars = self.cars
        passengers = self.passengers
        indicator_cars_vec = np.zeros(self.max_cars)
        indicator_passengers_vec = np.zeros(self.max_passengers)

        # Encode information about cars
        cars_vec = np.array([0] * (2 * self.max_cars))

        for i, car in enumerate(cars):
            cars_vec[2 * i:2 * i + 2] = [car.position[0], car.position[1]]
            indicator_cars_vec[i] = 1

        # Encode information about passengers
        passengers_vec = np.array([0] * (4 * self.max_passengers))
        for i, passenger in enumerate(passengers):
            passengers_vec[4 * i:4 * i + 4] = [
                passenger.pick_up_point[0], passenger.pick_up_point[1],
                passenger.drop_off_point[0], passenger.drop_off_point[1]
            ]
            indicator_passengers_vec[i] = 1

        return torch.tensor(np.concatenate(
            (cars_vec, indicator_cars_vec, passengers_vec,
             indicator_passengers_vec)),
                            device=self.device,
                            dtype=torch.float).unsqueeze(0)

    def train(self):

        duration_sum = 0.0

        for episode in range(self.num_episodes):

            self.reset_different_num()
            #self.reset()
            #self.reset_orig_env()

            state = self.get_state()

            if self.mode == "dqn" or self.mode == "qmix":
                action = self.select_action(state)

            elif self.mode == "random":
                action = self.random_action([state])

            elif self.mode == "greedy":
                action = [self.algorithm.greedy_fcfs(self.grid_map)]
                action = torch.tensor(action,
                                      device=self.device,
                                      dtype=torch.long)
                #print(action.size())
                #print(action[:,:self.num_passengers])

            reward, duration = self.env.step(action[:, :self.num_passengers],
                                             self.mode)

            if self.mode == "dqn":
                reward.extend([0] *
                              (self.max_passengers - self.num_passengers))

            self.episode_durations.append(duration)
            count = self.count_matrix[self.num_passengers - 1,
                                      self.num_cars - 1]
            self.duration_matrix[
                self.num_passengers - 1,
                self.num_cars - 1] = self.duration_matrix[
                    self.num_passengers - 1, self.num_cars -
                    1] * (count / (count + 1)) + duration / (count + 1)
            self.count_matrix[self.num_passengers - 1, self.num_cars - 1] += 1
            duration_sum += duration

            if self.training:
                self.memory.push(
                    state, action,
                    torch.tensor(reward, device=self.device,
                                 dtype=torch.float).unsqueeze(0))
                self.optimize_model()

                self.plot_durations(self.mode)
                self.plot_loss_history(self.mode)

            if self.training and episode % self.num_save == 0:
                torch.save(self.policy_net.state_dict(),
                           "episode_" + str(episode) + "_" + self.load_file)
                if self.mode == "qmix":
                    torch.save(
                        self.mixer.state_dict(),
                        "mixer_episode_" + str(episode) + "_" + self.load_file)
                print("Checkpoint saved")

            print("Episode: ", episode)

        if self.training:
            torch.save(self.policy_net.state_dict(), self.load_file)
            if self.mode == "qmix":
                torch.save(self.mixer.state_dict(), "mixer_" + self.load_file)
            print("Checkpoint saved")

        print("Average duration was ", duration_sum / self.num_episodes)
        print("Finished")
        np.save("Duration_matrix", self.duration_matrix)
        np.save("Count_matrix", self.count_matrix)
        print(self.duration_matrix)
        print(self.count_matrix)

    def reset(self):

        self.env.reset()
        self.grid_map = self.env.grid_map
        self.cars = self.env.grid_map.cars
        self.passengers = self.env.grid_map.passengers

    def reset_different_num(self):

        self.env.grid_map.cars = []
        self.env.grid_map.passengers = []
        self.env.grid_map.num_passengers = random.randint(
            1, self.max_passengers)
        self.env.grid_map.num_cars = random.randint(1, self.max_cars)
        self.env.grid_map.add_passenger(self.env.grid_map.num_passengers)
        self.env.grid_map.add_cars(self.env.grid_map.num_cars)

        self.grid_map = self.env.grid_map
        self.num_passengers = self.env.grid_map.num_passengers
        self.num_cars = self.env.grid_map.num_cars
        self.cars = self.env.grid_map.cars
        self.passengers = self.env.grid_map.passengers

    def reset_orig_env(self):

        self.env = copy.deepcopy(self.orig_env)
        self.grid_map = self.env.grid_map
        self.cars = self.env.grid_map.cars
        self.passengers = self.env.grid_map.passengers
        self.grid_map.init_zero_map_cost()

    def optimize_model(self):
        if len(self.memory) < self.batch_size:
            return

        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))

        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)

        self.policy_net.train()

        q_values = self.policy_net(state_batch).view(self.batch_size,
                                                     self.max_passengers,
                                                     self.max_cars)
        q_values = torch.cat((q_values,
                              torch.zeros(
                                  (self.batch_size, self.max_passengers, 1),
                                  device=self.device)), 2)
        state_action_values = q_values.gather(
            2, action_batch.unsqueeze(2)).squeeze()

        # Compute the expected Q values
        expected_state_action_values = reward_batch

        # Compute Huber loss
        if self.mode == "dqn":
            loss = F.smooth_l1_loss(state_action_values,
                                    expected_state_action_values)
        elif self.mode == "qmix":
            self.mixer.train()
            chosen_action_qvals = self.mixer(state_action_values, state_batch)
            loss = F.smooth_l1_loss(chosen_action_qvals,
                                    reward_batch.view(-1, 1, 1))
            #loss = F.mse_loss(chosen_action_qvals, reward_batch.view(-1, 1, 1))

        self.loss_history.append(loss.item())

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()

        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def plot_durations(self, filename):
        print("Saving durations plot ...")
        plt.figure(2)
        plt.clf()

        total_steps = np.array(self.episode_durations)

        N = len(total_steps)
        window_size = 200
        if N < window_size:
            total_steps_smoothed = total_steps
        else:
            total_steps_smoothed = np.zeros(N - window_size)

            for i in range(N - window_size):
                window_steps = total_steps[i:i + window_size]
                total_steps_smoothed[i] = np.average(window_steps)

        plt.title('Episode Duration history')
        plt.xlabel('Episode')
        plt.ylabel('Duration')

        plt.plot(total_steps_smoothed)
        np.save("Duration_" + filename, total_steps_smoothed)
        #plt.savefig("Durations_history_" + filename)

    def plot_loss_history(self, filename):
        print("Saving loss history ...")
        plt.figure(2)
        plt.clf()
        #loss = torch.tensor(self.loss_history, dtype=torch.float)

        total_loss = np.array(self.loss_history)

        N = len(total_loss)
        window_size = 50
        if N < window_size:
            total_loss_smoothed = total_loss
        else:
            total_loss_smoothed = np.zeros(N - window_size)

            for i in range(N - window_size):
                window_steps = total_loss[i:i + window_size]
                total_loss_smoothed[i] = np.average(window_steps)

        plt.title('Loss history')
        plt.xlabel('Episodes')
        plt.ylabel('Loss')
        plt.plot(self.loss_history)
        np.save("Loss_" + filename, total_loss_smoothed)
Example #3
class Agent(AbstractAgent):
    actions = ['←', '→', '↑', '↓']

    def __init__(self, env, p=1.0, lr=0.8, y=0.95, step_cost=.0, living_cost=.0, episode_length=100,
                 memory_capacity=100, batch_size=10, target_update=10, eps=0.5, eps_decay=0.999):
        AbstractAgent.__init__(self, eps, eps_decay)
        self.env = env
        self.lr = lr
        self.y = y
        self.step_cost = step_cost
        self.living_cost = living_cost
        q = (1.0 - p) / 2
        self.stochastic_actions = {
            '←': [[0, 2, 3], [p, q, q]],
            '→': [[1, 2, 3], [p, q, q]],
            '↑': [[2, 0, 1], [p, q, q]],
            '↓': [[3, 0, 1], [p, q, q]]
        }
        self.s0 = env.field.index('s')
        self.episode_length = episode_length
        self.rewards = []
        self.losses = []
        self.state_len = env.width * env.height
        self.nn = Model(
            in_features=self.state_len,
            hidden=[],
            out_features=len(Agent.actions))
        self.target_nn = Model(
            in_features=self.state_len,
            hidden=[],
            out_features=len(Agent.actions))
        self.target_nn.load_state_dict(self.nn.state_dict())
        self.target_nn.eval()

        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.nn.parameters(), lr=0.05)
        self.memory = ReplayMemory(memory_capacity)
        self.batch_size = batch_size
        self.target_update = target_update

    def step(self, state, action):
        # Simulate stochastic transitions: the desired action happens with
        # probability p; with probability (1 - p) / 2 each, the agent slips sideways.
        sa = self.stochastic_actions[action]
        mp_action = np.random.choice(sa[0], p=sa[1])
        action = Agent.actions[mp_action]
        return self.env.step(state, action)

    def print_policy(self):
        for y in range(self.env.height):
            for x in range(self.env.width):
                s = y * self.env.width + x
                cell = self.env.field[s]
                if not (cell == '.' or cell == 's'):
                    print(cell, end='')
                    continue
                q_predicted = self._predict_q_policy(s)
                a = torch.argmax(q_predicted, 0).item()
                print(Agent.actions[a], end='')
            print()

    def _encode_state(self, s):
        z = np.zeros(self.state_len)
        z[s] = 1
        return torch.tensor(z, dtype=torch.float)

    def _predict_q_policy(self, s):
        return self.nn(self._encode_state(s))

    def _predict_q_target(self, s):
        return self.target_nn(self._encode_state(s))

    def optimize(self):
        if len(self.memory) < self.batch_size:
            return

        transitions = self.memory.sample(self.batch_size)
        for s, a, s1, r in transitions:
            q_predicted = self._predict_q_policy(s)
            q_target = q_predicted.clone().detach()
            q_target[a] = r + self.y * self._predict_q_target(s1).max().item()

            loss = self.criterion(q_predicted, q_target)
            self.losses.append(loss.item())

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

    def run_episode(self):
        AbstractAgent.run_episode(self)
        s = self.s0
        episode_number = len(self.rewards)
        self.rewards.append(.0)
        for j in range(self.episode_length):
            q_predicted = self._predict_q_policy(s)
            a = torch.argmax(q_predicted, 0).item()
            a = self.select_action(a)
            s1, r, over = self.step(s, Agent.actions[a])
            if s != s1:
                r -= self.step_cost
            r -= self.living_cost
            self.memory.push(s, a, s1, r)
            s = s1
            self.optimize()
            self.rewards[-1] += r
            if over:
                break
        if episode_number % self.target_update == 0:
            self.target_nn.load_state_dict(self.nn.state_dict())
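
`Model(in_features, hidden, out_features)` is not defined in this example. With `hidden=[]` it degenerates to a single linear layer over a one-hot state encoding (Example #7 reuses the same builder with two hidden layers). A plausible sketch, which is an assumption rather than the project's actual class:

import torch.nn as nn


class Model(nn.Module):
    """Hypothetical MLP builder matching Model(in_features, hidden, out_features)."""

    def __init__(self, in_features, hidden, out_features):
        super().__init__()
        layers = []
        sizes = [in_features] + list(hidden) + [out_features]
        for i in range(len(sizes) - 1):
            layers.append(nn.Linear(sizes[i], sizes[i + 1]))
            # ReLU between layers, but not after the output layer.
            if i < len(sizes) - 2:
                layers.append(nn.ReLU())
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)
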
Example #4
            # Act in the true environment.
            #print(env)
            #old_obs = obs
            obs, reward, done, info = env.step(action.item() +
                                               ENV_CONFIGS[args.env]['offset'])
            # Preprocess incoming observation.
            if not done:
                obs = preprocess(obs, envID=args.env, env=env).unsqueeze(0)
                next_obs_stack = torch.cat(
                    (obs_stack[:, 1:, ...], obs.unsqueeze(1)),
                    dim=1).to(device)
            else:
                next_obs_stack = None

            #action = action - ENV_CONFIGS[args.env]['offset']
            memory.push(obs_stack, action, next_obs_stack, reward)
            obs_stack = next_obs_stack

            # Optimize the DQN every env_config["train_frequency"] steps and sync
            # the target network every env_config["target_update_frequency"] steps.

            if (count % env_config['train_frequency'] == 0):
                loss = optimize(dqn, target_dqn, memory, optimizer)
            if (count % env_config['target_update_frequency'] == 0):
                target_dqn.load_state_dict(dqn.state_dict())
            count += 1
        # Evaluate the current agent.
        if episode % args.evaluate_freq == 0:
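
The fragment above calls `optimize(dqn, target_dqn, memory, optimizer)`, which is defined elsewhere. Below is a generic sketch of what such a DQN update could look like, assuming the memory yields `(obs, action, next_obs, reward)` tuples (with `next_obs = None` for terminal transitions) and that the network exposes `batch_size` and `gamma` attributes; all of these details are assumptions, not the project's actual code:

import torch
import torch.nn.functional as F


def optimize(dqn, target_dqn, memory, optimizer):
    """Hypothetical DQN update step; tuple layout and attribute names are assumptions."""
    if len(memory) < dqn.batch_size:
        return None

    # Assume the sampler returns a list of (obs, action, next_obs, reward) tuples,
    # with next_obs = None for terminal transitions.
    transitions = memory.sample(dqn.batch_size)
    obs, actions, next_obs, rewards = zip(*transitions)

    obs = torch.cat(obs)
    actions = torch.stack(actions).view(-1, 1)
    rewards = torch.tensor(rewards, dtype=torch.float, device=obs.device)

    non_final_mask = torch.tensor([o is not None for o in next_obs],
                                  dtype=torch.bool, device=obs.device)
    non_final_next = torch.cat([o for o in next_obs if o is not None])

    # Q(s, a) for the actions actually taken.
    q_values = dqn(obs).gather(1, actions).squeeze(1)

    # Bootstrapped targets from the frozen target network; terminal states get 0.
    next_q = torch.zeros(dqn.batch_size, device=obs.device)
    with torch.no_grad():
        next_q[non_final_mask] = target_dqn(non_final_next).max(1)[0]
    targets = rewards + dqn.gamma * next_q

    loss = F.mse_loss(q_values, targets)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()
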
Example #5
        current_screen = get_screen(env, device)
        diff = current_screen - last_screen
        state = torch.cat((current_screen, last_screen, diff), dim=1)

        for t in count():
            action = select_action(state)
            _, reward, done, _ = env.step(action)
            reward = torch.tensor([reward], device=device)

            last_screen = current_screen
            current_screen = get_screen(env, device)
            diff = current_screen - last_screen
            if not done:
                next_state = torch.cat((current_screen, last_screen, diff),
                                       dim=1)
            else:
                next_state = None

            memory.push(state, action, next_state, reward)
            state = next_state

            optimize_model()
            if done:
                print('duration:', t + 1)
                break

        if i_episode % TARGET_UPDATE == 0:
            target_net.load_state_dict(policy_net.state_dict())

    env.close()
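
`get_screen(env, device)` is not shown. In the style of the PyTorch DQN tutorial this loop resembles, it would render the environment and return a `(1, C, H, W)` float tensor; a rough sketch (the actual cropping and resolution are assumptions):

import numpy as np
import torch
import torchvision.transforms as T

# Hypothetical preprocessing pipeline; the real project may crop or scale differently.
resize = T.Compose([T.ToPILImage(), T.Resize((84, 84)), T.ToTensor()])


def get_screen(env, device):
    # Render as HWC uint8 and convert to CHW float in [0, 1].
    screen = env.render(mode='rgb_array').transpose((2, 0, 1))
    screen = np.ascontiguousarray(screen, dtype=np.float32) / 255
    screen = torch.from_numpy(screen)
    # Resize and add a batch dimension -> shape (1, C, H, W).
    return resize(screen).unsqueeze(0).to(device)
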
Example #6
            # Act in the true environment.
            next_obs, reward, done, _ = env.step(action.item())

            reward = preprocess(reward, env=args.env)

            step_number += 1

            # Preprocess incoming observation.
            if done:
                next_obs = None
            else:
                next_obs = preprocess(next_obs, env=args.env)

            # Add the transition to the replay memory
            memory.push(obs, action, next_obs, reward)

            obs = next_obs

            # Run DQN.optimize() every env_config["train_frequency"] steps.
            if step_number % env_config['train_frequency'] == 0:
                optimize(dqn, target_dqn, memory, optimizer)

            # Update the target network every env_config["target_update_frequency"] steps.
            if step_number % env_config['target_update_frequency'] == 0:
                target_dqn.load_state_dict(dqn.state_dict())

        # Evaluate the current agent.
        if episode % args.evaluate_freq == 0:
            mean_return = evaluate_policy(dqn,
                                          env,
Example #7
class Agent(AbstractAgent):
    actions = ['←', '→', '↑', '↓']

    def __init__(self,
                 env,
                 lr=0.8,
                 y=0.95,
                 step_cost=.0,
                 living_cost=.0,
                 episode_length=100,
                 memory_capacity=100,
                 batch_size=25,
                 eps=0.5,
                 eps_decay=0.999):
        AbstractAgent.__init__(self, eps, eps_decay)
        self.env = env
        self.lr = lr
        self.y = y
        self.step_cost = step_cost
        self.living_cost = living_cost
        self.s0 = env.field.index('s')
        self.episode_length = episode_length
        self.rewards = []
        self.losses = []
        self.state_len = env.width * env.height

        self.nn = Model(in_features=2,
                        hidden=[self.state_len, self.state_len],
                        out_features=len(Agent.actions))

        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.nn.parameters(), lr=0.01)
        self.memory = ReplayMemory(memory_capacity)
        self.batch_size = batch_size

    def step(self, state, action):
        return self.env.step(state, action)

    def print_policy(self):
        for y in range(self.env.height):
            for x in range(self.env.width):
                s = y * self.env.width + x
                cell = self.env.field[s]
                if not (cell == '.' or cell == 's'):
                    print(cell, end='')
                    continue
                q_predicted = self._predict_q(s)
                a = torch.argmax(q_predicted, 0).item()
                print(Agent.actions[a], end='')
            print()

    def _encode_state(self, s):
        # z = np.zeros(self.state_len)
        # z[s] = 1
        # return torch.tensor(z, dtype=torch.float)
        w = self.env.width
        x, y = s % w, s // w
        return torch.tensor([x, y], dtype=torch.float)

    def _predict_q(self, s):
        return self.nn(self._encode_state(s))

    def optimize(self):
        if len(self.memory) < self.batch_size:
            return

        transitions = self.memory.sample(self.batch_size)
        for s, a, s1, r in transitions:
            q_predicted = self._predict_q(s)
            q_target = q_predicted.clone().detach()
            q_target[a] = r + self.y * self._predict_q(s1).max().item()

            loss = self.criterion(q_predicted, q_target)
            # Store the scalar loss value rather than the tensor, so the
            # computation graph is not kept alive between updates.
            self.losses.append(loss.item())

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

    def run_episode(self):
        AbstractAgent.run_episode(self)
        s = self.s0
        self.rewards.append(.0)
        for j in range(self.episode_length):
            q_predicted = self._predict_q(s)
            a = torch.argmax(q_predicted, 0).item()
            a = self.select_action(a)
            s1, r, over = self.step(s, Agent.actions[a])
            if s != s1:
                r -= self.step_cost
            r -= self.living_cost
            self.memory.push(s, a, s1, r)
            s = s1
            self.optimize()
            self.rewards[-1] += r
            if over:
                break
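
A minimal driver for the agent above, assuming a grid-world environment object exposing `width`, `height`, `field`, and `step(state, action)` as used by the class (the constructor name `GridWorld` is an assumption):

# Hypothetical usage; adjust to however the environment is actually constructed.
env = GridWorld()
agent = Agent(env, step_cost=0.04, episode_length=100)

for _ in range(2000):
    agent.run_episode()

agent.print_policy()
print('mean reward over the last 100 episodes:',
      sum(agent.rewards[-100:]) / 100)
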
Example #8
class Agent(AbstractAgent):
    actions = ['←', '→', '↑', '↓']

    def __init__(self,
                 env,
                 model,
                 lr=0.8,
                 y=0.95,
                 step_cost=.0,
                 living_cost=.0,
                 episode_length=100,
                 memory_capacity=100,
                 batch_size=10,
                 eps=0.5,
                 eps_decay=0.999):
        AbstractAgent.__init__(self, eps, eps_decay)
        self.env = env
        self.model = model
        self.lr = lr
        self.y = y
        self.step_cost = step_cost
        self.living_cost = living_cost
        self.s0 = env.field.index('s')
        self.episode_length = episode_length
        self.rewards = []
        self.losses = []
        self.memory = ReplayMemory(memory_capacity)
        self.batch_size = batch_size

    def step(self, state, action):
        return self.env.step(state, action)

    def print_policy(self):
        for y in range(self.env.height):
            for x in range(self.env.width):
                s = y * self.env.width + x
                cell = self.env.field[s]
                if not (cell == '.' or cell == 's'):
                    print(cell, end='')
                    continue
                q_predicted = self.predict_q(s)
                a = np.argmax(q_predicted)
                print(Agent.actions[a], end='')
            print()

    def _encode_state(self, s):
        z = np.zeros(self.env.length)
        z[s] = 1.0
        return np.array([z])

    def predict_q(self, s):
        return self.model.predict(self._encode_state(s))[0]

    def optimize(self):
        if len(self.memory) < self.batch_size:
            return

        transitions = self.memory.sample(self.batch_size)
        for s, a, s1, r in transitions:
            q_predicted = self.predict_q(s)
            # Copy the prediction so the in-place update does not alias q_predicted.
            q_target = q_predicted.copy()
            q_target[a] = r + self.y * self.predict_q(s1).max()

            history = self.model.fit(x=self._encode_state(s),
                                     y=np.array([q_target]),
                                     epochs=1,
                                     verbose=False)
            self.losses.append(history.history["loss"][-1])

    def run_episode(self):
        AbstractAgent.run_episode(self)
        s = self.s0
        self.rewards.append(.0)
        for j in range(self.episode_length):
            q_predicted = self.predict_q(s)
            a = np.argmax(q_predicted)
            a = self.select_action(a)
            s1, r, over = self.step(s, Agent.actions[a])
            if s != s1:
                r -= self.step_cost
            r -= self.living_cost
            self.memory.push(s, a, s1, r)
            s = s1
            self.optimize()
            self.rewards[-1] += r
            if over:
                break
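
Example #8 expects a Keras-style `model` with `predict()` and `fit()` over a one-hot state of length `env.length`. A compact way such a model might be built (a sketch, not the original project's code):

from tensorflow import keras


def build_model(state_len, num_actions, lr=0.01):
    # state_len should equal env.length (the one-hot encoding used above).
    model = keras.Sequential([
        keras.Input(shape=(state_len,)),
        keras.layers.Dense(num_actions, activation='linear'),
    ])
    # MSE against the bootstrapped Q-targets produced in optimize().
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=lr), loss='mse')
    return model


# agent = Agent(env, model=build_model(env.length, len(Agent.actions)))
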
Example #9
def train(args):
    device = torch.device("cuda" if args.gpu else "cpu")
    env = Environment(draw=False,
                      fps=args.fps,
                      debug=args.debug,
                      dist_to_pipe=args.dist_to_pipe,
                      dist_between_pipes=args.dist_between_pipes,
                      obs_this_pipe=args.obs_this_pipe)

    observation_space = env.get_observation_size_buffer()
    action_space = env.get_action_size()

    policy_network = DQN(observation_space, action_space).to(device)
    target_network = DQN(observation_space, action_space).to(device)

    optimizer = torch.optim.Adam(policy_network.parameters(), lr=args.lr)

    replay_buffer = ReplayMemory(args.replay_capacity)
    writer = SummaryWriter()

    if args.inference:
        target_network.load_checkpoint()

    best_reward = None
    iteration = 0
    total_reward = 0.0
    rewards = []
    state = env.reset()
    while True:
        epsilon = max(args.final_eps,
                      args.start_eps - iteration / args.eps_decay_final_step)

        iteration += 1
        episode_reward = None
        if np.random.rand() < epsilon:
            action = env.get_action_random()
        else:
            state_v = torch.tensor(np.array([state], copy=False)).to(device)
            q_vals_v = policy_network(state_v.float())
            _, act_v = torch.max(q_vals_v, dim=1)
            action = int(act_v.item())

        next_state, reward, done = env.step(action)
        total_reward += reward

        replay_buffer.push(state, action, next_state, reward, done)

        state = next_state

        if done:
            episode_reward = total_reward
            state = env.reset()
            total_reward = 0.0

        if episode_reward is not None:
            rewards.append(episode_reward)
            mean_reward = np.mean(rewards[-80:])
            print(
                f"Step {iteration}:  eps {epsilon}  mean reward {mean_reward}  episode reward {episode_reward}"
            )

            writer.add_scalar("epsilon", epsilon, iteration)
            writer.add_scalar("mean_reward", mean_reward, iteration)
            writer.add_scalar("reward", episode_reward, iteration)

            if best_reward is None or best_reward < mean_reward:
                torch.save(policy_network.state_dict(),
                           f"./models/checkpoint_{iteration}")
                print(f"New best reward found: {best_reward} -> {mean_reward}")
                best_reward = mean_reward
            if mean_reward > args.goal_reward:
                print(f"Achieved in {iteration} steps.")
                break

        if len(replay_buffer) < args.replay_start_step:
            continue

        if iteration % args.target_update_iterations == 0:
            target_network.load_state_dict(policy_network.state_dict())

        optimizer.zero_grad()

        batch = replay_buffer.sample(args.batch_size)
        loss = calculate_loss(batch,
                              policy_network,
                              target_network,
                              args.gamma,
                              device=device)

        loss.backward()
        optimizer.step()
    writer.close()
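
`calculate_loss(batch, policy_network, target_network, gamma, device)` is defined elsewhere. Below is a sketch of a standard DQN loss consistent with how the replay buffer is filled above, assuming `sample()` returns a list of `(state, action, next_state, reward, done)` tuples in the same order they were pushed; those details are assumptions:

import numpy as np
import torch
import torch.nn as nn


def calculate_loss(batch, policy_network, target_network, gamma, device="cpu"):
    """Hypothetical DQN loss over a batch of (state, action, next_state, reward, done)."""
    states, actions, next_states, rewards, dones = zip(*batch)

    states_v = torch.tensor(np.array(states), dtype=torch.float).to(device)
    next_states_v = torch.tensor(np.array(next_states), dtype=torch.float).to(device)
    actions_v = torch.tensor(actions, dtype=torch.long).to(device)
    rewards_v = torch.tensor(rewards, dtype=torch.float).to(device)
    done_mask = torch.tensor(dones, dtype=torch.bool).to(device)

    # Q(s, a) for the taken actions.
    q_values = policy_network(states_v).gather(
        1, actions_v.unsqueeze(-1)).squeeze(-1)

    # Bootstrapped targets from the target network; terminal states contribute 0.
    with torch.no_grad():
        next_q = target_network(next_states_v).max(1)[0]
        next_q[done_mask] = 0.0

    expected_q = rewards_v + gamma * next_q
    return nn.MSELoss()(q_values, expected_q)
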