Example #1
    def __init__(self,
                 env,
                 lr=0.8,
                 y=0.95,
                 step_cost=.0,
                 living_cost=.0,
                 episode_length=100,
                 memory_capacity=100,
                 batch_size=25,
                 eps=0.5,
                 eps_decay=0.999):
        AbstractAgent.__init__(self, eps, eps_decay)
        self.env = env
        self.lr = lr
        self.y = y
        self.step_cost = step_cost
        self.living_cost = living_cost
        self.s0 = env.field.index('s')
        self.episode_length = episode_length
        self.rewards = []
        self.losses = []
        self.state_len = env.width * env.height

        self.nn = Model(in_features=2,
                        hidden=[self.state_len, self.state_len],
                        out_features=len(Agent.actions))

        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.nn.parameters(), lr=0.01)
        self.memory = ReplayMemory(memory_capacity)
        self.batch_size = batch_size
Example #2
    def __init__(self,
                 env,
                 model,
                 lr=0.8,
                 y=0.95,
                 step_cost=.0,
                 living_cost=.0,
                 episode_length=100,
                 memory_capacity=100,
                 batch_size=10,
                 eps=0.5,
                 eps_decay=0.999):
        AbstractAgent.__init__(self, eps, eps_decay)
        self.env = env
        self.model = model
        self.lr = lr
        self.y = y
        self.step_cost = step_cost
        self.living_cost = living_cost
        self.s0 = env.field.index('s')
        self.episode_length = episode_length
        self.rewards = []
        self.losses = []
        self.memory = ReplayMemory(memory_capacity)
        self.batch_size = batch_size
Example #3
    def __init__(self, inputs, n_actions):
        self.brain = DeepQNetwork(inputs, 16, 16, outputNum=n_actions)
        self.target_brain = DeepQNetwork(inputs, 16, 16, outputNum=n_actions)
        self.target_brain.load_state_dict(self.brain.state_dict())
        self.target_brain.eval()

        self.set_params()
        self.optimizer = torch.optim.Adam(self.brain.parameters())
        self.memory = ReplayMemory(50000)
        self.action_space = [0, 1]
Example #4
    def __init__(self, env, input_size, output_size, hidden_size, mix_hidden=32, batch_size=128, lr=0.001, gamma=.999, eps_start=0.9,
                 eps_end=0.05, eps_decay=750, replay_capacity=10000, num_save=200, num_episodes=10000, mode="random", training=False, load_file=None):
        self.env = env
        self.orig_env = copy.deepcopy(env)
        self.grid_map = env.grid_map
        self.cars = env.grid_map.cars
        self.num_cars = len(self.cars)
        self.passengers = env.grid_map.passengers
        self.num_passengers = len(self.passengers)
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.eps_start = eps_start
        self.eps_end = eps_end
        self.eps_decay = eps_decay
        self.replay_capacity = replay_capacity
        self.num_episodes = num_episodes
        self.steps_done = 0
        self.lr = lr
        self.mode = mode
        self.num_save = num_save
        self.training = training
        self.algorithm = PairAlgorithm()
        self.episode_durations = []
        self.loss_history = []
        
        self.memory = ReplayMemory(self.replay_capacity)
        
        self.device = torch.device("cpu")#"cuda:0" if torch.cuda.is_available() else 
        print("Device being used:", self.device)
        self.policy_net = DQN(self.input_size, self.output_size , self.hidden_size).to(self.device)
        
        self.params = list(self.policy_net.parameters())

        
        if self.mode == "qmix":
            self.mixer = QMixer(self.input_size, self.num_passengers, mix_hidden).to(self.device)
            self.params += list(self.mixer.parameters())
            
        
        if load_file:
            self.policy_net.load_state_dict(torch.load(load_file))
            self.policy_net.eval()
            if self.mode == "qmix":
                self.mixer.load_state_dict(torch.load("mixer_" + load_file))
                self.mixer.eval()
            self.load_file = "Trained_" + load_file
            print("Checkpoint loaded")
        else:         
            self.load_file = self.mode + "_model_num_cars_" + str(self.num_cars) + "_num_passengers_" + str(self.num_passengers) + \
                    "_num_episodes_" + str(self.num_episodes) + "_hidden_size_" + str(self.hidden_size) + ".pth"
            
        self.optimizer = optim.RMSprop(self.params, lr=self.lr)
Example #5
    def __init__(self):
        # self.config = config
        self.gamma = 0.4

        # self.logger = logging.getLogger("DQNAgent")

        self.screen_width = 600

        # define models (policy and target)
        self.policy_model = DQN()
        self.target_model = DQN()

        # define memory
        self.memory = ReplayMemory()

        # define loss
        self.loss = HuberLoss()

        # define optimizer
        self.optim = torch.optim.Adam(self.policy_model.parameters(), lr=0.01)

        # define environment
        self.env = PyCar()  #TODO
        # self.cartpole = PyCar(self.screen_width)

        # initialize counter
        self.current_episode = 0
        self.current_iteration = 0
        self.episode_durations = []

        self.batch_size = 250

        # set cuda flag
        self.is_cuda = torch.cuda.is_available()

        self.cuda = self.is_cuda

        if self.cuda:
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")

        self.policy_model = self.policy_model.to(self.device)
        self.target_model = self.target_model.to(self.device)
        self.loss = self.loss.to(self.device)

        # Initialize Target model with policy model state dict
        self.target_model.load_state_dict(self.policy_model.state_dict())
        self.target_model.eval()

        self.savepath = "/home/sk002/Documents/RL-Project/model/"
Example #6
    def testReplayMemory(self):
        od = [84, 84, 4]
        ad = [8, 10]
        rd = [5]
        s = int(10000)
        b = 32

        rm = ReplayMemory(obs_dim=od, act_dim=ad, r_dim=rd, size=s)
        o = self.get_rand(od)
        a = self.get_rand(ad)
        r = self.get_rand(rd)
        d = 0
        for _ in range(1000):
            rm.store(o, a, r, o, d)

        o_s, a_s, r_s, on_s, d_s = rm.sample(b)

        self.assertEqual(o_s.shape, combined_shape(b, od))
        self.assertEqual(a_s.shape, combined_shape(b, ad))
        self.assertEqual(r_s.shape, combined_shape(b, rd))
        self.assertEqual(on_s.shape, combined_shape(b, od))
        self.assertEqual(d_s.shape, combined_shape(b))
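
# The assertions above compare shapes against a combined_shape helper that is
# not shown in this snippet. A minimal sketch of the interface they assume
# (an illustrative reconstruction, not the original implementation):
def combined_shape(length, shape=None):
    """Return (length,) for scalar entries and (length, *shape) otherwise."""
    if shape is None:
        return (length,)
    return (length, *shape) if isinstance(shape, (list, tuple)) else (length, shape)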
Example #7
    def __init__(self, env, p=1.0, lr=0.8, y=0.95, step_cost=.0, living_cost=.0, episode_length=100,
                 memory_capacity=100, batch_size=10, target_update=10, eps=0.5, eps_decay=0.999):
        AbstractAgent.__init__(self, eps, eps_decay)
        self.env = env
        self.lr = lr
        self.y = y
        self.step_cost = step_cost
        self.living_cost = living_cost
        q = (1.0 - p) / 2
        self.stochastic_actions = {
            '←': [[0, 2, 3], [p, q, q]],
            '→': [[1, 2, 3], [p, q, q]],
            '↑': [[2, 0, 1], [p, q, q]],
            '↓': [[3, 0, 1], [p, q, q]]
        }
        self.s0 = env.field.index('s')
        self.episode_length = episode_length
        self.rewards = []
        self.losses = []
        self.state_len = env.width * env.height
        self.nn = Model(
            in_features=self.state_len,
            hidden=[],
            out_features=len(Agent.actions))
        self.target_nn = Model(
            in_features=self.state_len,
            hidden=[],
            out_features=len(Agent.actions))
        self.target_nn.load_state_dict(self.nn.state_dict())
        self.target_nn.eval()

        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.nn.parameters(), lr=0.05)
        self.memory = ReplayMemory(memory_capacity)
        self.batch_size = batch_size
        self.target_update = target_update
Example #8
class RLAgent(Player):
    def __init__(self,
                 name,
                 others=None,
                 last_n=10,
                 load_path=None,
                 checkpoint=5000,
                 fixed_strategy=False,
                 eps_decay=0.00005):
        if others is None:
            others = [1, 2]
        self.others = others
        self.last_n = last_n
        self.prev_points = 0
        self.batch_size = 32
        self.gamma = 0.9
        self.eps_start = 1
        self.eps_end = 0.01
        self.eps_decay = eps_decay
        self.target_update = 100
        self.plot_at = 1000
        self.q_max = []
        self.q_list = []
        self.checkpoint = checkpoint
        self.memory_size = 1000
        self.lr = 0.00001
        self.train = True

        self.input_dim = len(others) * 6
        self.output_dim = 3
        self.current_step = 1
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.memory = ReplayMemory(self.memory_size)

        # Initialize the policy and target networks
        self.policy_net = DQN(self.input_dim, self.output_dim).to(self.device)
        self.target_net = DQN(self.input_dim, self.output_dim).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        if load_path is not None:
            checkpoint = torch.load(load_path)
            self.policy_net.load_state_dict(checkpoint['model_state_dict'])
            self.policy_net.eval()
            self.eps_start = 0
            self.eps_end = 0
            self.train = False
        if fixed_strategy:
            self.strategy = FixedStrategy()
        else:
            self.strategy = EpsilonGreedyStrategy(self.eps_start, self.eps_end,
                                                  self.eps_decay)

        # Set the optimizer
        self.optimizer = optim.Adam(params=self.policy_net.parameters(),
                                    lr=self.lr)
        self.loss = None

        # Push to replay memory
        self.prev_state = None
        self.action = None
        self.reward = None
        self.current_state = None

        super().__init__(name)

    def select_action(self, valid_actions, history):
        # print(self.memory.can_provide_sample(self.batch_size))
        if self.memory.can_provide_sample(self.batch_size) and self.train:
            self.train_model()

        if len(history) > self.last_n + 1:
            self.prev_state, self.current_state = self.get_states(history)
            self.reward = self.get_reward()
            if self.action is not None and self.train:
                self.memory.push(
                    Experience(self.prev_state, self.action,
                               self.current_state, self.reward))
            self.action = self.get_action(valid_actions)
            return self.action.item()
        else:
            return np.random.choice(valid_actions)

    def get_states(self, history):
        prev_state, current_state = [], []
        if len(history) > self.last_n + 1:
            for other in self.others:
                other_history = [i[other] for i in history]
                other_last_n = other_history[-self.last_n:]
                other_last_n_p = other_history[-self.last_n - 1:-1]
                other_policy_total = get_policy(other_history)
                other_policy_last_n = get_policy(other_last_n)
                other_policy_total_p = get_policy(other_history[:-1])
                other_policy_last_n_p = get_policy(other_last_n_p)
                prev_state.extend(other_policy_total_p + other_policy_last_n_p)
                current_state.extend(other_policy_total + other_policy_last_n)
        return torch.as_tensor(prev_state).unsqueeze(-2), torch.as_tensor(
            current_state).unsqueeze(-2)

    def get_reward(self):
        reward = self.points - self.prev_points
        self.prev_points = self.points
        return torch.tensor([reward])

    def get_action(self, valid_actions):
        rate = self.strategy.get_exploration_rate(self.current_step)
        self.current_step += 1
        if rate > random.random():
            # For random, we can pass the allowable_moves vector and choose from it randomly
            action = np.random.choice(valid_actions)
            return torch.tensor([action]).to(self.device)  # explore
        else:
            with torch.no_grad():
                self.q_max.append(
                    self.policy_net(self.current_state).max().item())
                return self.policy_net(self.current_state).max(1)[1].to(
                    self.device)  # exploit

    def train_model(self):
        experiences = self.memory.sample(self.batch_size)
        states, actions, rewards, next_states = extract_tensors(experiences)
        if self.current_step % self.target_update == 0:
            print('UPDATE TARGET NET', self.current_step)
            self.q_list.extend(self.q_max)
            print('Q Max', sum(self.q_max) / self.target_update)
            q_max_list.append(sum(self.q_max) / self.target_update)
            self.q_max = []
            self.target_net.load_state_dict(self.policy_net.state_dict())

        if self.current_step % self.plot_at == 0:
            e_ = self.memory.memory[-100:]
            batch = Experience(*zip(*e_))
            print('\n', '*' * 42)
            print('EXPLORATION RATE',
                  self.strategy.get_exploration_rate(self.current_step))
            print('REWARD', sum(batch.reward).item())
            print('POLICY', get_policy([i.item() for i in batch.action]))
            print('*' * 42, '\n')
            plt.plot(range(len(q_max_list)), q_max_list)
            plt.show()
        if self.current_step % self.checkpoint == 0:
            print('SAVE CHECKPOINT AT', self.current_step)
            checkpoint_path = checkpoint_folder + checkpoint_prefix + str(
                self.current_step) + checkpoint_suffix
            torch.save({'model_state_dict': self.policy_net.state_dict()},
                       checkpoint_path)
        current_q_values = QValues.get_current(self.policy_net, states,
                                               actions)
        next_q_values = QValues.get_next(self.policy_net, self.target_net,
                                         next_states)
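        # One-step TD target for DQN: reward + gamma * next-state value estimate.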
        target_q_values = (next_q_values * self.gamma) + rewards
        self.loss = F.mse_loss(current_q_values, target_q_values)
        self.optimizer.zero_grad()
        self.loss.backward()
        self.optimizer.step()
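
# Example #8 relies on several helpers defined elsewhere in its project
# (Experience, EpsilonGreedyStrategy, extract_tensors, QValues, q_max_list and
# the checkpoint_* path globals). A hedged sketch of the pieces its call sites
# most directly constrain; a hypothetical reconstruction, not the original code:
import math
from collections import namedtuple

import torch

Experience = namedtuple('Experience', ('state', 'action', 'next_state', 'reward'))


class EpsilonGreedyStrategy:
    # Exponentially decaying exploration rate, consistent with eps_start=1,
    # eps_end=0.01 and a small per-step decay constant such as 0.00005.
    def __init__(self, start, end, decay):
        self.start, self.end, self.decay = start, end, decay

    def get_exploration_rate(self, current_step):
        return self.end + (self.start - self.end) * math.exp(-current_step * self.decay)


def extract_tensors(experiences):
    # Stack a list of Experience tuples into per-field tensors, in the order
    # train_model() unpacks them: states, actions, rewards, next_states.
    batch = Experience(*zip(*experiences))
    return (torch.cat(batch.state), torch.cat(batch.action),
            torch.cat(batch.reward), torch.cat(batch.next_state))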
Example #9
    # Initialize environment and config.
    env = gym.make(args.env)

    env_config = ENV_CONFIGS[args.env]
    env = gym.wrappers.AtariPreprocessing(env,
                                          screen_size=84,
                                          grayscale_obs=True,
                                          frame_skip=1,
                                          noop_max=30,
                                          scale_obs=True)
    # Initialize deep Q-networks.
    dqn = DQN(env_config=env_config).to(device)
    # TODO: Create and initialize target Q-network.
    target_dqn = DQN(env_config=env_config).to(device)
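    # Assumed standard initialization for the TODO above: start the target
    # network as a copy of the online network, as the other examples here do.
    target_dqn.load_state_dict(dqn.state_dict())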
    # Create replay memory.
    memory = ReplayMemory(env_config['memory_size'])

    # Initialize optimizer used for training the DQN. We use Adam rather than RMSProp.
    optimizer = torch.optim.Adam(dqn.parameters(), lr=env_config['lr'])

    # Keep track of best evaluation mean return achieved so far.
    best_mean_return = -float("Inf")
    for episode in range(env_config['n_episodes']):
        done = False
        obs = preprocess(env.reset(), envID=args.env, env=env).unsqueeze(0)
        obs_stack = torch.cat(env_config['obs_stack_size'] *
                              [obs]).unsqueeze(0).to(device)
        count = 0
        while not done:
            # TODO: Get action from DQN.
            action = dqn.act(obs_stack)
Example #10
    def __init__(self):
        # self.config = config
        self.gamma = 0.75

        # self.logger = logging.getLogger("DQNAgent")

        self.screen_width = 600

        # define models (policy and target)
        self.policy_model = DQN()
        self.target_model = DQN()

        # define memory
        self.memory = ReplayMemory()

        # define loss
        self.loss = HuberLoss()

        # define optimizer
        self.optim = torch.optim.Adam(self.policy_model.parameters(),
                                      lr=0.0001)

        # define environment
        self.env = PyCar()  #TODO
        # self.cartpole = PyCar(self.screen_width)

        # initialize counter
        self.current_episode = 0
        self.current_iteration = 0
        self.episode_durations = []

        self.batch_size = 250

        # set cuda flag
        self.is_cuda = torch.cuda.is_available()

        self.cuda = self.is_cuda

        if self.cuda:
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")

        self.policy_model = self.policy_model.to(self.device)
        self.target_model = self.target_model.to(self.device)
        self.loss = self.loss.to(self.device)

        # Initialize Target model with policy model state dict
        self.target_model.load_state_dict(self.policy_model.state_dict())
        self.target_model.eval()

        self.savepath = os.path.join(os.getcwd(), "model") + "/"
        if not os.path.isdir(self.savepath):
            os.makedirs(self.savepath)

        t = time.localtime()
        self.save_tensorboard_path = os.path.join(
            os.getcwd(), "tensorboard_record") + "/run_" + time.strftime(
                "%d_%m_%Y_%H_%M", t) + "/"
        if not os.path.isdir(self.save_tensorboard_path):
            os.makedirs(self.save_tensorboard_path)
        self.writer = SummaryWriter(self.save_tensorboard_path)
Example #11
class DQNAgent:
    def __init__(self):
        # self.config = config
        self.gamma = 0.75

        # self.logger = logging.getLogger("DQNAgent")

        self.screen_width = 600

        # define models (policy and target)
        self.policy_model = DQN()
        self.target_model = DQN()

        # define memory
        self.memory = ReplayMemory()

        # define loss
        self.loss = HuberLoss()

        # define optimizer
        self.optim = torch.optim.Adam(self.policy_model.parameters(),
                                      lr=0.0001)

        # define environment
        self.env = PyCar()  #TODO
        # self.cartpole = PyCar(self.screen_width)

        # initialize counter
        self.current_episode = 0
        self.current_iteration = 0
        self.episode_durations = []

        self.batch_size = 250

        # set cuda flag
        self.is_cuda = torch.cuda.is_available()

        self.cuda = self.is_cuda

        if self.cuda:
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")

        self.policy_model = self.policy_model.to(self.device)
        self.target_model = self.target_model.to(self.device)
        self.loss = self.loss.to(self.device)

        # Initialize Target model with policy model state dict
        self.target_model.load_state_dict(self.policy_model.state_dict())
        self.target_model.eval()

        self.savepath = os.path.join(os.getcwd(), "model") + "/"
        if not os.path.isdir(self.savepath):
            os.makedirs(self.savepath)

        t = time.localtime()
        self.save_tensorboard_path = os.path.join(
            os.getcwd(), "tensorboard_record") + "/run_" + time.strftime(
                "%d_%m_%Y_%H_%M", t) + "/"
        if not os.path.isdir(self.save_tensorboard_path):
            os.makedirs(self.save_tensorboard_path)
        self.writer = SummaryWriter(self.save_tensorboard_path)

    def run(self):
        """
        This function runs the operator.
        :return:
        """
        try:
            self.train()

        except KeyboardInterrupt as e:
            print(e)

    def select_action(self, state, random_only=False):
        """
        The action selection function: it either uses the model to choose an action or samples one uniformly.
        :param state: current state of the model
        :return:
        """

        self.eps_start = 0.90
        self.eps_end = 0.35
        self.eps_decay = 500

        if self.cuda:
            state = state.cuda()
        sample = random.random()
        eps_threshold = self.eps_start - (
            self.eps_start - self.eps_end) * math.exp(
                -1. * self.current_iteration / self.eps_decay)
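        # With eps_start=0.90 and eps_end=0.35 this threshold starts at 0.35 and
        # grows toward 0.90 over iterations, so the greedy (model) branch below is
        # taken increasingly often (unless random_only is set); note the naming is
        # inverted relative to the usual epsilon-greedy convention.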

        self.writer.add_scalar('epsilon', eps_threshold,
                               self.current_iteration)
        # print("Eps thresh: ", eps_threshold)
        if sample < eps_threshold and not random_only:
            # print("Model step")
            with torch.no_grad():
                return self.policy_model(state).max(1)[1].view(1,
                                                               1)  # size (1,1)
        else:
            # print("Random step")
            return torch.tensor([[random.randrange(5)]],
                                device=self.device,
                                dtype=torch.long)

    def get_action(self, state):

        if self.cuda:
            state = state.cuda()
        with torch.no_grad():
            return self.policy_model(state).max(1)[1].view(1, 1)  # size (1,1)

    def optimize_policy_model(self):
        """
        performs a single step of optimization for the policy model
        :return:
        """
        if self.memory.length() < self.batch_size:
            return

        self.memory.setup_epoch_training()

        total_loss = None
        training_len = math.ceil(self.memory.length() / self.batch_size)
        for i in range(training_len):
            # sample a batch
            transitions = self.memory.sample_batch(self.batch_size, i)
            len_transitions = len(transitions)

            one_batch = Transition(*zip(*transitions))

            non_final_mask = torch.tensor(tuple(
                map(lambda s: s is not None, one_batch.next_state)),
                                          device=self.device,
                                          dtype=torch.bool)
            non_final_next_states = torch.cat(
                [s for s in one_batch.next_state if s is not None])

            state_batch = torch.cat(one_batch.state)
            action_batch = torch.cat(one_batch.action)
            reward_batch = torch.cat(one_batch.reward)

            state_batch = state_batch.to(self.device)
            non_final_next_states = non_final_next_states.to(self.device)

            curr_state_values = self.policy_model(state_batch)  # [128, 2]
            curr_state_action_values = curr_state_values.gather(
                1, action_batch)  # [128, 1]

            next_state_values = torch.zeros(len_transitions,
                                            device=self.device)  # [128]
            next_state_values[non_final_mask] = self.target_model(
                non_final_next_states).max(1)[0].detach()  # [< 128]

            # Get the expected Q values
            expected_state_action_values = (next_state_values *
                                            self.gamma) + reward_batch  # [128]
            # compute loss: temporal difference error
            loss = self.loss(curr_state_action_values,
                             expected_state_action_values.unsqueeze(1))

            if total_loss is None:
                total_loss = loss
            else:
                total_loss += loss

            # optimizer step
            self.optim.zero_grad()
            loss.backward()
            for param in self.policy_model.parameters():
                param.grad.data.clamp_(-1, 1)
            self.optim.step()

        self.writer.add_scalar('loss', total_loss / training_len,
                               self.current_iteration)
        # return loss

    def train(self):
        """
        Training loop based on the number of episodes
        :return:
        """

        self.num_episodes = 2000
        self.target_update = 1

        mean_score, max_score, min_score = self.run_sim(100, random_only=True)

        self.writer.add_scalar('mean_score', mean_score, 0)
        self.writer.add_scalar('max_score', max_score, 0)
        self.writer.add_scalar('min_score', min_score, 0)

        for episode in tqdm(range(self.current_episode, self.num_episodes)):
            self.current_iteration += 1
            self.current_episode = episode
            # reset environment
            self.train_one_epoch()
            # The target network has its weights kept frozen most of the time
            if self.current_episode % self.target_update == 0:
                self.target_model.load_state_dict(
                    self.policy_model.state_dict())

            if self.current_episode % 25 == 0:
                torch.save(
                    self.policy_model.state_dict(), self.savepath +
                    "policy_epoch" + str(self.current_episode) + ".pth")
                torch.save(
                    self.target_model.state_dict(), self.savepath +
                    "target_epoch" + str(self.current_episode) + ".pth")

    def run_sim(self, count=20, random_only=False):
        score_list = []
        for i in range(count):
            self.env.reset_game()
            episode_duration = 0

            curr_state = torch.Tensor(self.env.get_state()).permute(
                2, 0, 1).unsqueeze(0)

            while (1):
                # time.sleep(0.1)
                episode_duration += 1

                # select action
                action = self.select_action(curr_state, random_only)

                images, reward, done, score = self.env.step(
                    action.item())  #TODO

                reward = torch.Tensor([reward]).to(self.device)

                # assign next state
                if done:
                    next_state = None
                else:
                    next_state = torch.Tensor(images).permute(2,
                                                              0, 1).unsqueeze(
                                                                  0)  #TODO

                # add this transition into memory
                self.memory.push_transition(curr_state, action, next_state,
                                            reward)

                curr_state = next_state

                if done:
                    score_list.append(score)
                    break

        return np.mean(np.array(score_list)), np.max(
            np.array(score_list)), np.min(np.array(score_list))

    def train_one_epoch(self):
        """
        One episode of training; it samples an action, observes the next screen, and optimizes the model once
        :return:
        """

        mean_score, max_score, min_score = self.run_sim()
        # print(mean_score)
        self.writer.add_scalar('mean_score', mean_score,
                               self.current_iteration)
        self.writer.add_scalar('max_score', max_score, self.current_iteration)
        self.writer.add_scalar('min_score', min_score, self.current_iteration)

        # Policy model optimization step
        self.optimize_policy_model()

    def validate(self):

        episode_duration = 0
        curr_state = torch.Tensor(self.env.get_state()).permute(2, 0,
                                                                1).unsqueeze(0)

        while (1):
            # time.sleep(0.1)

            episode_duration += 1
            # select action
            action = self.get_action(curr_state)

            images, reward, done, score = self.env.step(action.item())  #TODO

            reward = torch.Tensor([reward]).to(self.device)

            # assign next state
            if done:
                next_state = None
            else:
                next_state = torch.Tensor(images).permute(2, 0, 1).unsqueeze(
                    0)  #TODO

            curr_state = next_state

            if done:
                print(score)
                break
Example #12
class Agent(AbstractAgent):
    actions = ['←', '→', '↑', '↓']

    def __init__(self,
                 env,
                 lr=0.8,
                 y=0.95,
                 step_cost=.0,
                 living_cost=.0,
                 episode_length=100,
                 memory_capacity=100,
                 batch_size=25,
                 eps=0.5,
                 eps_decay=0.999):
        AbstractAgent.__init__(self, eps, eps_decay)
        self.env = env
        self.lr = lr
        self.y = y
        self.step_cost = step_cost
        self.living_cost = living_cost
        self.s0 = env.field.index('s')
        self.episode_length = episode_length
        self.rewards = []
        self.losses = []
        self.state_len = env.width * env.height

        self.nn = Model(in_features=2,
                        hidden=[self.state_len, self.state_len],
                        out_features=len(Agent.actions))

        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.nn.parameters(), lr=0.01)
        self.memory = ReplayMemory(memory_capacity)
        self.batch_size = batch_size

    def step(self, state, action):
        return self.env.step(state, action)

    def print_policy(self):
        for y in range(self.env.height):
            for x in range(self.env.width):
                s = y * self.env.width + x
                cell = self.env.field[s]
                if not (cell == '.' or cell == 's'):
                    print(cell, end='')
                    continue
                q_predicted = self._predict_q(s)
                a = torch.argmax(q_predicted, 0).item()
                print(Agent.actions[a], end='')
            print()

    def _encode_state(self, s):
        # z = np.zeros(self.state_len)
        # z[s] = 1
        # return torch.tensor(z, dtype=torch.float)
        w = self.env.width
        x, y = s % w, s // w
        return torch.tensor([x, y], dtype=torch.float)

    def _predict_q(self, s):
        return self.nn(self._encode_state(s))

    def optimize(self):
        if len(self.memory) < self.batch_size:
            return

        transitions = self.memory.sample(self.batch_size)
        for s, a, s1, r in transitions:
            q_predicted = self._predict_q(s)
            q_target = q_predicted.clone().detach()
            q_target[a] = r + self.y * self._predict_q(s1).max().item()

            loss = self.criterion(q_predicted, q_target)
            self.losses.append(loss.item())

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

    def run_episode(self):
        AbstractAgent.run_episode(self)
        s = self.s0
        self.rewards.append(.0)
        for j in range(self.episode_length):
            q_predicted = self._predict_q(s)
            a = torch.argmax(q_predicted, 0).item()
            a = self.select_action(a)
            s1, r, over = self.step(s, Agent.actions[a])
            if s != s1:
                r -= self.step_cost
            r -= self.living_cost
            self.memory.push(s, a, s1, r)
            s = s1
            self.optimize()
            self.rewards[-1] += r
            if over:
                break
Example #13
class DQNAgent:
    def __init__(self, inputs, n_actions):
        self.brain = DeepQNetwork(inputs, 16, 16, outputNum=n_actions)
        self.target_brain = DeepQNetwork(inputs, 16, 16, outputNum=n_actions)
        self.target_brain.load_state_dict(self.brain.state_dict())
        self.target_brain.eval()

        self.set_params()
        self.optimizer = torch.optim.Adam(self.brain.parameters())
        self.memory = ReplayMemory(50000)
        self.action_space = [0, 1]

    def set_params(self):
        self.batch_size = 64

        self.max_exploration_rate = 1
        self.min_exploration_rate = 0.05
        self.exploration_decay_rate = 0.0005

        self.steps_done = 0

    def select_action(self, state):
        sample = np.random.random()
        exploration_rate = self.min_exploration_rate + (
            self.max_exploration_rate - self.min_exploration_rate) * np.exp(
                -self.steps_done * self.exploration_decay_rate)
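        # Exponential epsilon decay: starts at max_exploration_rate (1.0) and
        # approaches min_exploration_rate (0.05) with a time constant of
        # 1 / exploration_decay_rate = 2000 steps.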

        self.steps_done += 1
        if sample > exploration_rate:
            with torch.no_grad():
                actions = self.brain(state)
                return torch.argmax(actions).item()
        else:
            return np.random.choice(self.action_space)

    def learn(self):
        if len(self.memory) < self.batch_size:
            return

        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))

        non_final_mask = torch.tensor(
            tuple(map(lambda s: s is not None, batch.next_state)),
            dtype=torch.bool,
        )
        non_final_next_states = torch.tensor(
            [s for s in batch.next_state if s is not None])

        state_batch = torch.tensor(batch.state)
        action_batch = torch.tensor(batch.action)
        reward_batch = torch.tensor(batch.reward, dtype=torch.float)

        state_action_values = self.brain(state_batch).gather(
            1, action_batch.unsqueeze(-1))

        next_state_values = torch.zeros(self.batch_size)
        next_state_values[non_final_mask] = self.target_brain(
            non_final_next_states).max(1)[0]

        gamma = 0.99
        expected_state_action_values = (gamma * next_state_values +
                                        reward_batch / reward_batch.max())

        self.loss = torch.nn.MSELoss()(
            expected_state_action_values.unsqueeze(-1), state_action_values)

        self.optimizer.zero_grad()
        self.loss.backward()
        self.optimizer.step()
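
# learn() above unpacks batches through a Transition namedtuple that is not
# shown here; a sketch of the fields its attribute accesses assume (field
# order is an assumption based on the common PyTorch DQN convention):
from collections import namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))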
Example #14
class Agent(AbstractAgent):
    actions = ['←', '→', '↑', '↓']

    def __init__(self,
                 env,
                 model,
                 lr=0.8,
                 y=0.95,
                 step_cost=.0,
                 living_cost=.0,
                 episode_length=100,
                 memory_capacity=100,
                 batch_size=10,
                 eps=0.5,
                 eps_decay=0.999):
        AbstractAgent.__init__(self, eps, eps_decay)
        self.env = env
        self.model = model
        self.lr = lr
        self.y = y
        self.step_cost = step_cost
        self.living_cost = living_cost
        self.s0 = env.field.index('s')
        self.episode_length = episode_length
        self.rewards = []
        self.losses = []
        self.memory = ReplayMemory(memory_capacity)
        self.batch_size = batch_size

    def step(self, state, action):
        return self.env.step(state, action)

    def print_policy(self):
        for y in range(self.env.height):
            for x in range(self.env.width):
                s = y * self.env.width + x
                cell = self.env.field[s]
                if not (cell == '.' or cell == 's'):
                    print(cell, end='')
                    continue
                q_predicted = self.predict_q(s)
                a = np.argmax(q_predicted)
                print(Agent.actions[a], end='')
            print()

    def _encode_state(self, s):
        z = np.zeros(self.env.length)
        z[s] = 1.0
        return np.array([z])

    def predict_q(self, s):
        return self.model.predict(self._encode_state(s))[0]

    def optimize(self):
        if len(self.memory) < self.batch_size:
            return

        transitions = self.memory.sample(self.batch_size)
        for s, a, s1, r in transitions:
            q_predicted = self.predict_q(s)
            q_target = q_predicted.copy()  # copy so the update below doesn't mutate the prediction in place
            q_target[a] = r + self.y * self.predict_q(s1).max()

            history = self.model.fit(x=self._encode_state(s),
                                     y=np.array([q_target]),
                                     epochs=1,
                                     verbose=False)
            self.losses.append(history.history["loss"][-1])

    def run_episode(self):
        AbstractAgent.run_episode(self)
        s = self.s0
        self.rewards.append(.0)
        for j in range(self.episode_length):
            q_predicted = self.predict_q(s)
            a = np.argmax(q_predicted)
            a = self.select_action(a)
            s1, r, over = self.step(s, Agent.actions[a])
            if s != s1:
                r -= self.step_cost
            r -= self.living_cost
            self.memory.push(s, a, s1, r)
            s = s1
            self.optimize()
            self.rewards[-1] += r
            if over:
                break
Example #15
class Agent(AbstractAgent):
    actions = ['←', '→', '↑', '↓']

    def __init__(self, env, p=1.0, lr=0.8, y=0.95, step_cost=.0, living_cost=.0, episode_length=100,
                 memory_capacity=100, batch_size=10, target_update=10, eps=0.5, eps_decay=0.999):
        AbstractAgent.__init__(self, eps, eps_decay)
        self.env = env
        self.lr = lr
        self.y = y
        self.step_cost = step_cost
        self.living_cost = living_cost
        q = (1.0 - p) / 2
        self.stochastic_actions = {
            '←': [[0, 2, 3], [p, q, q]],
            '→': [[1, 2, 3], [p, q, q]],
            '↑': [[2, 0, 1], [p, q, q]],
            '↓': [[3, 0, 1], [p, q, q]]
        }
        self.s0 = env.field.index('s')
        self.episode_length = episode_length
        self.rewards = []
        self.losses = []
        self.state_len = env.width * env.height
        self.nn = Model(
            in_features=self.state_len,
            hidden=[],
            out_features=len(Agent.actions))
        self.target_nn = Model(
            in_features=self.state_len,
            hidden=[],
            out_features=len(Agent.actions))
        self.target_nn.load_state_dict(self.nn.state_dict())
        self.target_nn.eval()

        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.nn.parameters(), lr=0.05)
        self.memory = ReplayMemory(memory_capacity)
        self.batch_size = batch_size
        self.target_update = target_update

    def step(self, state, action):
        # simulating Markov Process, desired action happens with probability p
        # but with the probability (1-p) / 2 the agent goes sideways
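        # e.g. with p=0.8, '↑' resolves to '↑' with probability 0.8 and to
        # '←' or '→' with probability 0.1 each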
        sa = self.stochastic_actions[action]
        mp_action = np.random.choice(sa[0], p=sa[1])
        action = Agent.actions[mp_action]
        return self.env.step(state, action)

    def print_policy(self):
        for y in range(self.env.height):
            for x in range(self.env.width):
                s = y * self.env.width + x
                cell = self.env.field[s]
                if not (cell == '.' or cell == 's'):
                    print(cell, end='')
                    continue
                q_predicted = self._predict_q_policy(s)
                a = torch.argmax(q_predicted, 0).item()
                print(Agent.actions[a], end='')
            print()

    def _encode_state(self, s):
        z = np.zeros(self.state_len)
        z[s] = 1
        return torch.tensor(z, dtype=torch.float)

    def _predict_q_policy(self, s):
        return self.nn(self._encode_state(s))

    def _predict_q_target(self, s):
        return self.target_nn(self._encode_state(s))

    def optimize(self):
        if len(self.memory) < self.batch_size:
            return

        transitions = self.memory.sample(self.batch_size)
        for s, a, s1, r in transitions:
            q_predicted = self._predict_q_policy(s)
            q_target = q_predicted.clone().detach()
            q_target[a] = r + self.y * self._predict_q_target(s1).max().item()

            loss = self.criterion(q_predicted, q_target)
            self.losses.append(loss.item())

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

    def run_episode(self):
        AbstractAgent.run_episode(self)
        s = self.s0
        episode_number = len(self.rewards)
        self.rewards.append(.0)
        for j in range(self.episode_length):
            q_predicted = self._predict_q_policy(s)
            a = torch.argmax(q_predicted, 0).item()
            a = self.select_action(a)
            s1, r, over = self.step(s, Agent.actions[a])
            if s != s1:
                r -= self.step_cost
            r -= self.living_cost
            self.memory.push(s, a, s1, r)
            s = s1
            self.optimize()
            self.rewards[-1] += r
            if over:
                break
        if episode_number % self.target_update == 0:
            self.target_nn.load_state_dict(self.nn.state_dict())
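
# ReplayMemory is referenced throughout these snippets but never defined, and
# its API differs between projects (push_transition/sample_batch in the PyCar
# agents, can_provide_sample in Example #8, .capacity in Example #13). A
# minimal sketch of the variant assumed by Examples #12 and #15: push four
# positional fields, sample a random batch, support len(). An illustrative
# reconstruction, not the original class:
import random


class ReplayMemory:
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []

    def push(self, *transition):
        # Evict the oldest transition once the buffer is full.
        if len(self.memory) >= self.capacity:
            self.memory.pop(0)
        self.memory.append(transition)

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)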
Example #16
    def __init__(self,
                 name,
                 others=None,
                 last_n=10,
                 load_path=None,
                 checkpoint=5000,
                 fixed_strategy=False,
                 eps_decay=0.00005):
        if others is None:
            others = [1, 2]
        self.others = others
        self.last_n = last_n
        self.prev_points = 0
        self.batch_size = 32
        self.gamma = 0.9
        self.eps_start = 1
        self.eps_end = 0.01
        self.eps_decay = eps_decay
        self.target_update = 100
        self.plot_at = 1000
        self.q_max = []
        self.q_list = []
        self.checkpoint = checkpoint
        self.memory_size = 1000
        self.lr = 0.00001
        self.train = True

        self.input_dim = len(others) * 6
        self.output_dim = 3
        self.current_step = 1
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.memory = ReplayMemory(self.memory_size)

        # Initialize the policy and target networks
        self.policy_net = DQN(self.input_dim, self.output_dim).to(self.device)
        self.target_net = DQN(self.input_dim, self.output_dim).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        if load_path is not None:
            checkpoint = torch.load(load_path)
            self.policy_net.load_state_dict(checkpoint['model_state_dict'])
            self.policy_net.eval()
            self.eps_start = 0
            self.eps_end = 0
            self.train = False
        if fixed_strategy:
            self.strategy = FixedStrategy()
        else:
            self.strategy = EpsilonGreedyStrategy(self.eps_start, self.eps_end,
                                                  self.eps_decay)

        # Set the optimizer
        self.optimizer = optim.Adam(params=self.policy_net.parameters(),
                                    lr=self.lr)
        self.loss = None

        # Push to replay memory
        self.prev_state = None
        self.action = None
        self.reward = None
        self.current_state = None

        super().__init__(name)
Example #17
                            dtype=torch.long)


if __name__ == "__main__":
    BATCH_SIZE = 128
    GAMMA = 0.999
    EPS_START = 0.9
    EPS_END = 0.05
    EPS_DECAY = 200
    TARGET_UPDATE = 10
    MAX_T = 9999
    steps_done = 0
    timer = Timer()
    rect = util.get_screen_rect()
    region = (rect[0], rect[1], rect[2] - rect[0], rect[3] - rect[1])
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    monitor = Monitor(device, region)
    env = gym.make("Game-v0")
    init_screen = monitor.get_screen(pytorch=True)
    _, _, height, width = init_screen.shape

    n_actions = env.action_space.n
    policy_net = DQN(width, height, n_actions).to(device)
    target_net = DQN(width, height, n_actions).to(device)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    optimizer = torch.optim.RMSprop(policy_net.parameters())
    memory = ReplayMemory(3000)
    simulate()
Example #18
    ACTION_BUILD_BARRACKS,
    ACTION_ATTACK,
    ACTION_SELECT_BARRACKS,
    ACTION_BUILD_MARINE,


]

KILL_UNIT_REWARD = 0.2
KILL_BUILDING_REWARD = 0.5

reward_check = []

model = DQN(6, 8)
optimizer = optim.RMSprop(model.parameters(), 1e-3)
memory = ReplayMemory(10000)


class DQNAgent(base_agent.BaseAgent):
    def __init__(self):
        super(DQNAgent, self).__init__()
        self.previous_state = None
        self.previous_action = None
        self.model = model
        self.memory = memory
        self.optimizer = optimizer
        self.diagnostics = [0, 0, 0, 0, 0, 0, 0, 0]

        self.base_top_left = None
        self.supply_depot_built = False
        self.scv_selected = False
Example #19
outdir = 'results/selfx-billard'
env = wrappers.Monitor(env, directory=outdir, force=True)
env.seed(0)
env.reset()

init_screen = get_screen(env, device)
_, _, screen_height, screen_width = init_screen.shape
n_actions = len(env.action_space)

policy_net = DQN(screen_height, screen_width, n_actions).to(device)
target_net = DQN(screen_height, screen_width, n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

steps_done = 0
memory = ReplayMemory(10000)
optimizer = optim.RMSprop(policy_net.parameters())


def select_action(state):
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1
    if sample > eps_threshold:
        with torch.no_grad():
            expected_reward = policy_net(state)
            return expected_reward.max(1)[1].view(1, 1)
    else:
        return torch.tensor([[random.randrange(n_actions)]],
Example #20
class DQNAgent:
    def __init__(self):
        # self.config = config
        self.gamma = 0.4

        # self.logger = logging.getLogger("DQNAgent")

        self.screen_width = 600

        # define models (policy and target)
        self.policy_model = DQN()
        self.target_model = DQN()

        # define memory
        self.memory = ReplayMemory()

        # define loss
        self.loss = HuberLoss()

        # define optimizer
        self.optim = torch.optim.Adam(self.policy_model.parameters(), lr=0.01)

        # define environment
        self.env = PyCar()  #TODO
        # self.cartpole = PyCar(self.screen_width)

        # initialize counter
        self.current_episode = 0
        self.current_iteration = 0
        self.episode_durations = []

        self.batch_size = 250

        # set cuda flag
        self.is_cuda = torch.cuda.is_available()

        self.cuda = self.is_cuda

        if self.cuda:
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")

        self.policy_model = self.policy_model.to(self.device)
        self.target_model = self.target_model.to(self.device)
        self.loss = self.loss.to(self.device)

        # Initialize Target model with policy model state dict
        self.target_model.load_state_dict(self.policy_model.state_dict())
        self.target_model.eval()

        self.savepath = "/home/sk002/Documents/RL-Project/model/"

    def run(self):
        """
        This function runs the operator.
        :return:
        """
        try:
            self.train()

        except KeyboardInterrupt as e:
            print(e)

    def select_action(self, state):
        """
        The action selection function: it either uses the model to choose an action or samples one uniformly.
        :param state: current state of the model
        :return:
        """

        self.eps_start = 0.95
        self.eps_end = 0.65
        self.eps_decay = 2000

        if self.cuda:
            state = state.cuda()
        sample = random.random()
        eps_threshold = self.eps_start - (
            self.eps_start - self.eps_end) * math.exp(
                -1. * self.current_iteration / self.eps_decay)
        self.current_iteration += 1
        # print("Eps thresh: ", eps_threshold)
        if sample < eps_threshold:
            # print("Model step")
            with torch.no_grad():
                return self.policy_model(state).max(1)[1].view(1,
                                                               1)  # size (1,1)
        else:
            # print("Random step")
            return torch.tensor([[random.randrange(5)]],
                                device=self.device,
                                dtype=torch.long)

    def get_action(self, state):

        if self.cuda:
            state = state.cuda()
        with torch.no_grad():
            return self.policy_model(state).max(1)[1].view(1, 1)  # size (1,1)

    def optimize_policy_model(self):
        """
        performs a single step of optimization for the policy model
        :return:
        """
        if self.memory.length() < self.batch_size:
            return
        # sample a batch
        transitions = self.memory.sample_batch(self.batch_size)

        one_batch = Transition(*zip(*transitions))

        non_final_mask = torch.tensor(tuple(
            map(lambda s: s is not None, one_batch.next_state)),
                                      device=self.device,
                                      dtype=torch.bool)
        non_final_next_states = torch.cat(
            [s for s in one_batch.next_state if s is not None])

        state_batch = torch.cat(one_batch.state)
        action_batch = torch.cat(one_batch.action)
        reward_batch = torch.cat(one_batch.reward)

        state_batch = state_batch.to(self.device)
        non_final_next_states = non_final_next_states.to(self.device)

        curr_state_values = self.policy_model(state_batch)  # [128, 2]
        curr_state_action_values = curr_state_values.gather(
            1, action_batch)  # [128, 1]

        next_state_values = torch.zeros(self.batch_size,
                                        device=self.device)  # [128]
        next_state_values[non_final_mask] = self.target_model(
            non_final_next_states).max(1)[0].detach()  # [< 128]

        # Get the expected Q values
        expected_state_action_values = (next_state_values *
                                        self.gamma) + reward_batch  # [128]
        # compute loss: temporal difference error
        loss = self.loss(curr_state_action_values,
                         expected_state_action_values.unsqueeze(1))

        # optimizer step
        self.optim.zero_grad()
        loss.backward()
        for param in self.policy_model.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optim.step()

        return loss

    def train(self):
        """
        Training loop based on the number of episodes
        :return:
        """

        self.num_episodes = 1000
        self.target_update = 5

        for episode in tqdm(range(self.current_episode, self.num_episodes)):
            self.current_episode = episode
            # reset environment
            self.env.reset_game()
            self.train_one_epoch()
            # The target network has its weights kept frozen most of the time
            if self.current_episode % self.target_update == 0:
                self.target_model.load_state_dict(
                    self.policy_model.state_dict())

            if self.current_episode % 50 == 0:
                torch.save(
                    self.policy_model.state_dict(), self.savepath +
                    "policy_epoch" + str(self.current_episode) + ".pth")
                torch.save(
                    self.target_model.state_dict(), self.savepath +
                    "target_epoch" + str(self.current_episode) + ".pth")

    def train_one_epoch(self):
        """
        One episode of training; it samples an action, observes the next screen, and optimizes the model once
        :return:
        """
        episode_duration = 0

        curr_state = torch.Tensor(self.env.get_state()).permute(2, 0,
                                                                1).unsqueeze(0)

        while (1):
            # time.sleep(0.1)

            episode_duration += 1
            # select action
            action = self.select_action(curr_state)

            images, reward, done, score = self.env.step(action.item())  #TODO

            reward = torch.Tensor([reward]).to(self.device)

            # assign next state
            if done:
                next_state = None
            else:
                next_state = torch.Tensor(images).permute(2, 0, 1).unsqueeze(
                    0)  #TODO

            # add this transition into memory
            self.memory.push_transition(curr_state, action, next_state, reward)

            curr_state = next_state

            # Policy model optimization step
            curr_loss = self.optimize_policy_model()
            if curr_loss is not None:
                if self.cuda:
                    curr_loss = curr_loss.cpu()

            if done:
                print(score)
                break

    def validate(self):

        episode_duration = 0
        curr_state = torch.Tensor(self.env.get_state()).permute(2, 0,
                                                                1).unsqueeze(0)

        while (1):
            # time.sleep(0.1)

            episode_duration += 1
            # select action
            action = self.get_action(curr_state)

            images, reward, done, score = self.env.step(action.item())  #TODO

            reward = torch.Tensor([reward]).to(self.device)

            # assign next state
            if done:
                next_state = None
            else:
                next_state = torch.Tensor(images).permute(2, 0, 1).unsqueeze(
                    0)  #TODO

            curr_state = next_state

            if done:
                print(score)
                break
Example #21
class Agent:
    def __init__(self,
                 env,
                 input_size,
                 output_size,
                 hidden_size,
                 max_cars=10,
                 max_passengers=10,
                 mix_hidden=32,
                 batch_size=128,
                 lr=0.001,
                 gamma=.999,
                 eps_start=0.9,
                 eps_end=0.05,
                 eps_decay=750,
                 replay_capacity=10000,
                 num_save=200,
                 num_episodes=10000,
                 mode="random",
                 training=False,
                 load_file=None):
        self.env = env
        self.orig_env = copy.deepcopy(env)
        self.grid_map = env.grid_map
        self.cars = env.grid_map.cars
        self.num_cars = len(self.cars)
        self.passengers = env.grid_map.passengers
        self.num_passengers = len(self.passengers)
        self.max_cars = max_cars
        self.max_passengers = max_passengers
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.eps_start = eps_start
        self.eps_end = eps_end
        self.eps_decay = eps_decay
        self.replay_capacity = replay_capacity
        self.num_episodes = num_episodes
        self.steps_done = 0
        self.lr = lr
        self.mode = mode
        self.num_save = num_save
        self.training = training
        self.algorithm = PairAlgorithm()
        self.episode_durations = []
        self.duration_matrix = np.zeros((self.max_passengers, self.max_cars))
        self.count_matrix = np.zeros((self.max_passengers, self.max_cars))
        self.loss_history = []

        self.memory = ReplayMemory(self.replay_capacity)

        self.device = torch.device("cpu")
        print("Device being used:", self.device)
        self.policy_net = DQN(self.input_size, self.output_size,
                              self.hidden_size).to(self.device)

        self.params = list(self.policy_net.parameters())

        if self.mode == "qmix":
            self.mixer = QMixer(self.input_size, self.max_passengers,
                                mix_hidden).to(self.device)
            self.params += list(self.mixer.parameters())

        if load_file:
            self.policy_net.load_state_dict(torch.load(load_file))
            if self.mode == "qmix":
                self.mixer.load_state_dict(torch.load("mixer_" + load_file))
                self.mixer.eval()
            self.policy_net.eval()
            self.load_file = "Pretrained_" + load_file
            print("Checkpoint loaded")
        else:
            self.load_file = self.mode + "_model_num_cars_" + str(self.num_cars) + "_num_passengers_" + str(self.num_passengers) + \
                    "_num_episodes_" + str(self.num_episodes) + "_hidden_size_" + str(self.hidden_size) + ".pth"

        self.optimizer = optim.RMSprop(self.params, lr=self.lr)
        #self.optimizer = optim.Adam(self.params, lr=self.lr, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)
        #self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, 1500, gamma=0.1)

    def select_action(self, state):
        # Select action with an epsilon-greedy policy
        sample = random.random()

        eps_threshold = self.eps_end + (self.eps_start - self.eps_end) * \
            math.exp(-1. * self.steps_done / self.eps_decay)

        print(eps_threshold)

        self.steps_done += 1

        if not self.training:
            eps_threshold = 0.0

        if sample > eps_threshold:
            # Choose best action
            with torch.no_grad():

                self.policy_net.eval()
                action = self.policy_net(state).view(
                    self.max_passengers,
                    self.max_cars)[:, :self.num_cars].max(1)[1].view(
                        1, self.max_passengers)
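                # Slots beyond num_passengers are padding; assign them the dummy action index (max_cars)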
                action[0, self.num_passengers:] = self.max_cars
                return action

        else:
            # Choose a random action
            action = torch.tensor([[
                random.randrange(self.num_cars)
                for car in range(self.max_passengers)
            ]],
                                  device=self.device,
                                  dtype=torch.long)
            action[0, self.num_passengers:] = self.max_cars
            return action

    def random_action(self, state):
        return torch.tensor([[
            random.randrange(self.num_cars)
            for car in range(self.num_passengers)
        ]],
                            device=self.device,
                            dtype=torch.long)

    def get_state(self):
        # Cars (px, py, 1=matched), Passengers(pickup_x, pickup_y, dest_x, dest_y, 1=matched)
        # Vector Size = 3*C + 5*P
        cars = self.cars
        passengers = self.passengers
        indicator_cars_vec = np.zeros(self.max_cars)
        indicator_passengers_vec = np.zeros(self.max_passengers)

        # Encode information about cars
        cars_vec = np.array([0] * (2 * self.max_cars))

        for i, car in enumerate(cars):
            cars_vec[2 * i:2 * i + 2] = [car.position[0], car.position[1]]
            indicator_cars_vec[i] = 1

        # Encode information about passengers
        passengers_vec = np.array([0] * (4 * self.max_passengers))
        for i, passenger in enumerate(passengers):
            passengers_vec[4 * i:4 * i + 4] = [
                passenger.pick_up_point[0], passenger.pick_up_point[1],
                passenger.drop_off_point[0], passenger.drop_off_point[1]
            ]
            indicator_passengers_vec[i] = 1

        return torch.tensor(np.concatenate(
            (cars_vec, indicator_cars_vec, passengers_vec,
             indicator_passengers_vec)),
                            device=self.device,
                            dtype=torch.float).unsqueeze(0)

    def train(self):

        duration_sum = 0.0

        for episode in range(self.num_episodes):

            self.reset_different_num()
            #self.reset()
            #self.reset_orig_env()

            state = self.get_state()

            if self.mode == "dqn" or self.mode == "qmix":
                action = self.select_action(state)

            elif self.mode == "random":
                action = self.random_action([state])

            elif self.mode == "greedy":
                action = [self.algorithm.greedy_fcfs(self.grid_map)]
                action = torch.tensor(action,
                                      device=self.device,
                                      dtype=torch.long)
                #print(action.size())
                #print(action[:,:self.num_passengers])

            reward, duration = self.env.step(action[:, :self.num_passengers],
                                             self.mode)

            if self.mode == "dqn":
                reward.extend([0] *
                              (self.max_passengers - self.num_passengers))

            self.episode_durations.append(duration)
            # Incremental running average of episode duration per (num_passengers, num_cars) cell
            idx = (self.num_passengers - 1, self.num_cars - 1)
            count = self.count_matrix[idx]
            self.duration_matrix[idx] = (self.duration_matrix[idx] * count +
                                         duration) / (count + 1)
            self.count_matrix[idx] += 1
            duration_sum += duration

            if self.training:
                self.memory.push(
                    state, action,
                    torch.tensor(reward, device=self.device,
                                 dtype=torch.float).unsqueeze(0))
                self.optimize_model()

                self.plot_durations(self.mode)
                self.plot_loss_history(self.mode)

            if self.training and episode % self.num_save == 0:
                torch.save(self.policy_net.state_dict(),
                           "episode_" + str(episode) + "_" + self.load_file)
                if self.mode == "qmix":
                    torch.save(
                        self.mixer.state_dict(),
                        "mixer_episode_" + str(episode) + "_" + self.load_file)
                print("Checkpoint saved")

            print("Episode: ", episode)

        if self.training:
            torch.save(self.policy_net.state_dict(), self.load_file)
            if self.mode == "qmix":
                torch.save(self.mixer.state_dict(), "mixer_" + self.load_file)
            print("Checkpoint saved")

        print("Average duration was ", duration_sum / self.num_episodes)
        print("Finished")
        np.save("Duration_matrix", self.duration_matrix)
        np.save("Count_matrix", self.count_matrix)
        print(self.duration_matrix)
        print(self.count_matrix)

    def reset(self):

        self.env.reset()
        self.grid_map = self.env.grid_map
        self.cars = self.env.grid_map.cars
        self.passengers = self.env.grid_map.passengers

    def reset_different_num(self):

        self.env.grid_map.cars = []
        self.env.grid_map.passengers = []
        self.env.grid_map.num_passengers = random.randint(
            1, self.max_passengers)
        self.env.grid_map.num_cars = random.randint(1, self.max_cars)
        self.env.grid_map.add_passenger(self.env.grid_map.num_passengers)
        self.env.grid_map.add_cars(self.env.grid_map.num_cars)

        self.grid_map = self.env.grid_map
        self.num_passengers = self.env.grid_map.num_passengers
        self.num_cars = self.env.grid_map.num_cars
        self.cars = self.env.grid_map.cars
        self.passengers = self.env.grid_map.passengers

    def reset_orig_env(self):

        self.env = copy.deepcopy(self.orig_env)
        self.grid_map = self.env.grid_map
        self.cars = self.env.grid_map.cars
        self.passengers = self.env.grid_map.passengers
        self.grid_map.init_zero_map_cost()

    def optimize_model(self):
        if len(self.memory) < self.batch_size:
            return

        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))

        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)

        self.policy_net.train()

        q_values = self.policy_net(state_batch).view(self.batch_size,
                                                     self.max_passengers,
                                                     self.max_cars)
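        # Append a zero-valued Q column so the dummy padding action index
        # (self.max_cars) assigned to non-existent passengers can be gathered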
        q_values = torch.cat((q_values,
                              torch.zeros(
                                  (self.batch_size, self.max_passengers, 1),
                                  device=self.device)), 2)
        state_action_values = q_values.gather(
            2, action_batch.unsqueeze(2)).squeeze()

        # Compute the expected Q values
        expected_state_action_values = reward_batch

        # Compute Huber loss
        if self.mode == "dqn":
            loss = F.smooth_l1_loss(state_action_values,
                                    expected_state_action_values)
        elif self.mode == "qmix":
            self.mixer.train()
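            # Mix the per-passenger chosen Q-values into a single joint value conditioned on the global state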
            chosen_action_qvals = self.mixer(state_action_values, state_batch)
            loss = F.smooth_l1_loss(chosen_action_qvals,
                                    reward_batch.view(-1, 1, 1))
            #loss = F.mse_loss(chosen_action_qvals, reward_batch.view(-1, 1, 1))

        self.loss_history.append(loss.item())

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()

        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def plot_durations(self, filename):
        print("Saving durations plot ...")
        plt.figure(2)
        plt.clf()

        total_steps = np.array(self.episode_durations)

        N = len(total_steps)
        window_size = 200
        if N < window_size:
            total_steps_smoothed = total_steps
        else:
            total_steps_smoothed = np.zeros(N - window_size)

            for i in range(N - window_size):
                window_steps = total_steps[i:i + window_size]
                total_steps_smoothed[i] = np.average(window_steps)

        plt.title('Episode Duration history')
        plt.xlabel('Episode')
        plt.ylabel('Duration')

        plt.plot(total_steps_smoothed)
        np.save("Duration_" + filename, total_steps_smoothed)
        #plt.savefig("Durations_history_" + filename)

    def plot_loss_history(self, filename):
        print("Saving loss history ...")
        plt.figure(2)
        plt.clf()
        #loss = torch.tensor(self.loss_history, dtype=torch.float)

        total_loss = np.array(self.loss_history)

        N = len(total_loss)
        window_size = 50
        if N < window_size:
            total_loss_smoothed = total_loss
        else:
            total_loss_smoothed = np.zeros(N - window_size)

            for i in range(N - window_size):
                window_steps = total_loss[i:i + window_size]
                total_loss_smoothed[i] = np.average(window_steps)

        plt.title('Loss history')
        plt.xlabel('Episodes')
        plt.ylabel('Loss')
        plt.plot(self.loss_history)
        np.save("Loss_" + filename, total_loss_smoothed)
Example #22
0
def train(args):
    device = torch.device("cuda" if args.gpu else "cpu")
    env = Environment(draw=False,
                      fps=args.fps,
                      debug=args.debug,
                      dist_to_pipe=args.dist_to_pipe,
                      dist_between_pipes=args.dist_between_pipes,
                      obs_this_pipe=args.obs_this_pipe)

    observation_space = env.get_observation_size_buffer()
    action_space = env.get_action_size()

    policy_network = DQN(observation_space, action_space).to(device)
    target_network = DQN(observation_space, action_space).to(device)

    optimizer = torch.optim.Adam(policy_network.parameters(), lr=args.lr)

    replay_buffer = ReplayMemory(args.replay_capacity)
    writer = SummaryWriter()

    if args.inference:
        target_network.load_checkpoint()

    best_reward = None
    iteration = 0
    total_reward = 0.0
    rewards = []
    state = env.reset()
    while True:
        epsilon = max(args.final_eps,
                      args.start_eps - iteration / args.eps_decay_final_step)

        iteration += 1
        episode_reward = None
        if np.random.rand() < epsilon:
            action = env.get_action_random()
        else:
            state_v = torch.tensor(np.array([state], copy=False)).to(device)
            q_vals_v = policy_network(state_v.float())
            _, act_v = torch.max(q_vals_v, dim=1)
            action = int(act_v.item())

        next_state, reward, done = env.step(action)
        total_reward += reward

        replay_buffer.push(state, action, next_state, reward, done)

        state = next_state

        if done:
            episode_reward = total_reward
            state = env.reset()
            total_reward = 0.0

        if episode_reward is not None:
            rewards.append(episode_reward)
            mean_reward = np.mean(rewards[-80:])
            print(
                f"Step {iteration}:  eps {epsilon}  mean reward {mean_reward}  episode reward {episode_reward}"
            )

            writer.add_scalar("epsilon", epsilon, iteration)
            writer.add_scalar("mean_reward", mean_reward, iteration)
            writer.add_scalar("reward", episode_reward, iteration)

            if best_reward is None or best_reward < mean_reward:
                torch.save(policy_network.state_dict(),
                           f"./models/checkpoint_{iteration}")
                print(f"New best reward found: {best_reward} -> {mean_reward}")
                best_reward = mean_reward
            if mean_reward > args.goal_reward:
                print(f"Achieved in {iteration} steps.")
                break

        if len(replay_buffer) < args.replay_start_step:
            continue

        if iteration % args.target_update_iterations == 0:
            target_network.load_state_dict(policy_network.state_dict())

        optimizer.zero_grad()

        batch = replay_buffer.sample(args.batch_size)
        loss = calculate_loss(batch,
                              policy_network,
                              target_network,
                              args.gamma,
                              device=device)

        loss.backward()
        optimizer.step()
    writer.close()
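
calculate_loss() is not shown in this example. Below is a sketch of the usual one-step DQN loss, assuming the batch is a sequence of (state, action, next_state, reward, done) tuples exactly as stored by replay_buffer.push() above; the original helper may use a different batch layout or loss function.

import numpy as np
import torch
import torch.nn as nn


def calculate_loss(batch, policy_network, target_network, gamma, device="cpu"):
    # Unpack the minibatch; ordering matches replay_buffer.push(state, action,
    # next_state, reward, done) in train() above.
    states, actions, next_states, rewards, dones = zip(*batch)

    states_v = torch.tensor(np.array(states), dtype=torch.float32).to(device)
    next_states_v = torch.tensor(np.array(next_states), dtype=torch.float32).to(device)
    actions_v = torch.tensor(actions, dtype=torch.int64).to(device)
    rewards_v = torch.tensor(rewards, dtype=torch.float32).to(device)
    done_mask = torch.tensor(dones, dtype=torch.bool).to(device)

    # Q(s, a) of the actions that were taken
    state_action_values = policy_network(states_v).gather(
        1, actions_v.unsqueeze(-1)).squeeze(-1)

    # Bootstrapped target: r + gamma * max_a' Q_target(s', a'), zero at episode end
    with torch.no_grad():
        next_state_values = target_network(next_states_v).max(1)[0]
        next_state_values[done_mask] = 0.0

    expected_values = rewards_v + gamma * next_state_values
    return nn.MSELoss()(state_action_values, expected_values)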