Example #1
class Tester():
    def __init__(self, render_flag):
        self.model = DDQN(36, 36)
        self.render_flag = render_flag
        self.width = 6
        self.height = 6
        self.env = MineSweeper(self.width, self.height, 6)
        if (self.render_flag):
            self.renderer = Render(self.env.state)
        self.load_models(20000)

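    ### Flatten the state and fog mask and query the DDQN for an action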
    def get_action(self, state):
        state = state.flatten()
        mask = (1 - self.env.fog).flatten()
        action = self.model.act(state, mask)
        return action

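    ### Load a saved checkpoint and disable exploration (epsilon = 0)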
    def load_models(self, number):
        path = "pre-trained\ddqn_dnn" + str(number) + ".pth"
        dict = torch.load(path)
        self.model.load_state_dict(dict['current_state_dict'])
        self.model.epsilon = 0

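    ### Convert the flat action index to board coordinates, optionally render, then step the environment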
    def do_step(self, action):
        i = int(action / self.width)
        j = action % self.width

        if (self.render_flag):
            self.renderer.state = self.env.state
            self.renderer.draw()
            self.renderer.bugfix()
        next_state, terminal, reward = self.env.choose(i, j)
        return next_state, terminal, reward
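
A minimal sketch of how the Tester above might be driven for evaluation. The evaluate helper and the per-episode re-creation of MineSweeper are illustrative assumptions, not part of the original code.

# Hypothetical evaluation loop for the Tester class from Example #1.
def evaluate(episodes=10):
    tester = Tester(render_flag=False)
    total_reward = 0.0
    for _ in range(episodes):
        # Assumption: start a fresh game by recreating the env rather than
        # relying on a reset() method that may not exist.
        tester.env = MineSweeper(tester.width, tester.height, 6)
        terminal = False
        while not terminal:
            action = tester.get_action(tester.env.state)
            _, terminal, reward = tester.do_step(action)
            total_reward += reward
    print("Average reward per episode:", total_reward / episodes)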
Example #2
class Driver():

    def __init__(self,width,height,bomb_no,render_flag):

        self.width = width
        self.height = height
        self.bomb_no = bomb_no
        self.box_count = width*height
        self.env = MineSweeper(self.width,self.height,self.bomb_no)
        self.current_model = DDQN(self.box_count,self.box_count)
        self.target_model = DDQN(self.box_count,self.box_count)
        self.target_model.eval()
        self.optimizer = torch.optim.Adam(self.current_model.parameters(),lr=0.003,weight_decay=1e-5)
        self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer,step_size=2000,gamma=0.95)
        self.target_model.load_state_dict(self.current_model.state_dict())
        self.buffer = Buffer(100000)
        self.gamma = 0.99
        self.render_flag = render_flag
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.90
        self.reward_threshold = 0.12
        self.reward_step = 0.01
        self.batch_size = 4096
        self.tau = 5e-5
        self.log = open("./Logs/ddqn_log.txt",'w')

        if(self.render_flag):
            self.Render = Render(self.env.state)

    
    def load_models(self,number):
        path = "./pre-trained/ddqn_dnn"+str(number)+".pth"
        weights = torch.load(path)
        self.current_model.load_state_dict(weights['current_state_dict'])
        self.target_model.load_state_dict(weights['target_state_dict'])
        self.optimizer.load_state_dict(weights['optimizer_state_dict'])
        self.current_model.epsilon = weights['epsilon']


    ### Get an action from the DDQN model given the current state and mask
    def get_action(self,state,mask):
        state = state.flatten()
        mask = mask.flatten()
        action = self.current_model.act(state,mask)
        return action

    ### Execute the action and return the next state, terminal flag, reward, and next fog mask
    def do_step(self,action):
        i = int(action/self.width)
        j = action%self.width
        if(self.render_flag):
            self.Render.state = self.env.state
            self.Render.draw()
            self.Render.bugfix()
        next_state,terminal,reward = self.env.choose(i,j)
        next_fog = 1-self.env.fog
        return next_state,terminal,reward,next_fog
    
    ### Reward-based epsilon decay: decay epsilon whenever the average reward clears the current threshold, then raise the threshold
    def epsilon_update(self,avg_reward):
        if(avg_reward>self.reward_threshold):
            self.current_model.epsilon = max(self.epsilon_min,self.current_model.epsilon*self.epsilon_decay)
            self.reward_threshold+= self.reward_step
    
    def TD_Loss(self):
        ### Sample a batch of transitions from the replay buffer
        state,action,mask,reward,next_state,next_mask,terminal = self.buffer.sample(self.batch_size)

        ### Convert the sampled variables to tensors for the DDQN models
        state      = FloatTensor(float32(state))
        mask       = FloatTensor(float32(mask))
        next_state = FloatTensor(float32(next_state))
        action     = LongTensor(action)
        next_mask  = FloatTensor(float32(next_mask))
        reward     = FloatTensor(reward)
        done       = FloatTensor(terminal)


        ### Predicts Q value for present and next state with current and target model
        q_values      = self.current_model(state,mask)
        next_q_values = self.target_model(next_state,next_mask)

        # TD target:
        #    non-terminal: target = reward + gamma * max_a' Q_target(next_state, a')
        #    terminal:     target = reward
        # Loss = mean squared error between Q_current(state, action) and this target

        q_value          = q_values.gather(1, action.unsqueeze(1)).squeeze(1)
        next_q_value     = next_q_values.max(1)[0]
        expected_q_value = reward + self.gamma * next_q_value * (1 - done)
        loss = (q_value - expected_q_value.detach()).pow(2).mean()
        loss_print = loss.item()    

        # Propagates the Loss
        self.optimizer.zero_grad()
        loss.backward()

        self.optimizer.step()
        self.scheduler.step()

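        # Soft (Polyak) update: nudge the target network toward the current network by a factor of tau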
        for target_param, local_param in zip(self.target_model.parameters(), self.current_model.parameters()):
            target_param.data.copy_(self.tau*local_param.data + (1.0-self.tau)*target_param.data)
        return loss_print

    def save_checkpoints(self,batch_no):
        path = "./pre-trained/ddqn_dnn"+str(batch_no)+".pth"
        torch.save({
            'epoch': batch_no,
            'current_state_dict': self.current_model.state_dict(),
            'target_state_dict' : self.target_model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'epsilon':self.current_model.epsilon
        }, path)

    def save_logs(self,batch_no,avg_reward,loss,wins):
        res = [
                    str(batch_no),
                    "\tAvg Reward: ",
                    str(avg_reward),
                    "\t Loss: ",
                    str(loss),
                    "\t Wins: ", 
                    str(wins),
                    "\t Epsilon: ",
                    str(self.current_model.epsilon)
        ]
        log_line = " ".join(res)
        print(log_line)
        self.log.write(log_line+"\n")
        self.log.flush()
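
Driver bundles everything needed for training: action selection, environment stepping, the replay buffer, the TD loss with a soft-updated target network, reward-based epsilon decay, checkpointing, and logging. A rough sketch of how these pieces might be wired together is shown below; it is not the repository's actual training routine, and it assumes that Buffer.push accepts the same fields Buffer.sample returns and that a finished game is replaced with a fresh MineSweeper.

# Hypothetical training loop for the Driver class from Example #2.
def train(steps=100000, log_every=100, checkpoint_every=20000):
    driver = Driver(width=6, height=6, bomb_no=6, render_flag=False)
    state, mask = driver.env.state, 1 - driver.env.fog
    rewards = []
    for step in range(1, steps + 1):
        action = driver.get_action(state, mask)
        next_state, terminal, reward, next_mask = driver.do_step(action)
        # Assumption: push mirrors the tuple order returned by Buffer.sample.
        driver.buffer.push(state, action, mask, reward, next_state, next_mask, terminal)
        state, mask = next_state, next_mask
        rewards.append(reward)
        if terminal:
            # Start a new game; win counting is omitted here because it
            # depends on the environment's reward convention.
            driver.env = MineSweeper(driver.width, driver.height, driver.bomb_no)
            state, mask = driver.env.state, 1 - driver.env.fog
        if step >= driver.batch_size and step % log_every == 0:
            loss = driver.TD_Loss()
            avg_reward = sum(rewards) / len(rewards)
            driver.epsilon_update(avg_reward)
            driver.save_logs(step, avg_reward, loss, wins=0)
            rewards = []
        if step % checkpoint_every == 0:
            driver.save_checkpoints(step)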
Example #3
            timestep += 1

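            # Halve the exploration rate every 500k steps (floored at 0.005) and log it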
            if timestep % 500000 == 1:
                epsilon = max(epsilon / 2., 0.005)
                qnet.set_epsilon(epsilon)
                tf.summary.scalar("parameters/epsilon", epsilon, step=timestep)
                writer.flush()
            obs_net = make_obs_network(obs, memory)
            if args.predictor:
                if timestep > args.startup_steps:
                    obs_hist, obs_pres, last_action = make_data_predictor(
                        obs, memory, action_shape)
                    iu, ic, m = pred.forward(obs_hist, obs_pres, last_action)
            if model_type == "CNN":
                obs_net = obs_net / 255.
            action = qnet.act(obs_net)
            new_obs, reward, done, _ = env.step(action)
            if args.render:
                env.render()
            if model_type == "CNN":
                new_obs = make_obs_memory(new_obs)

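            # Store the transition in the replay memory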
            memory.push(obs=obs,
                        action=action,
                        reward=reward,
                        done=done,
                        new_obs=new_obs)
            obs = new_obs

            if timestep > args.startup_steps:
                if timestep % args.update_interval == 0: