Example #1
    def select_action(self, state):
        # Preprocess the raw observation and add a batch dimension
        state = transform(state).unsqueeze(0).to(device).float()
        with torch.no_grad():
            # The policy head outputs the (alpha, beta) parameters of a Beta distribution
            alpha, beta = self.net(state)[0]
        # Use the Beta mean alpha / (alpha + beta) as a deterministic action in [0, 1]
        action = alpha / (alpha + beta)

        action = action.squeeze().cpu().numpy()
        return action
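Since alpha and beta parametrize a Beta distribution, alpha / (alpha + beta) is its mean, so this select_action returns a deterministic evaluation-time action that stays in [0, 1]. Below is a minimal sketch of the stochastic variant typically used during training; it mirrors the names from the snippet, but sampling with torch.distributions.Beta and the helper name select_action_stochastic are assumptions, not part of the original code.

import torch

def select_action_stochastic(net, state):
    # state: an already-preprocessed tensor of shape (1, ...);
    # net(state)[0] is assumed to return (alpha, beta), as in the snippet above
    with torch.no_grad():
        alpha, beta = net(state)[0]
    dist = torch.distributions.Beta(alpha, beta)
    action = dist.sample()                    # stochastic action in [0, 1]
    logp = dist.log_prob(action).sum(dim=-1)  # log-probability for a policy-gradient update
    return action.squeeze().cpu().numpy(), logp

Example #2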
    def forward(self, inputs, action=None, z_out=False, xesn_out=False, future_out=False):

        if future_out:
            # Predict future states with the dedicated "future" head
            return self.future(inputs)

        if z_out:
            # Encode a raw observation into the flattened VAE latent vector z
            inputs = transform(inputs).unsqueeze(0).to(device).double()
            B = inputs.shape[0]
            _, _, _, _, zs = self.model.vae(inputs)
            zs = zs.view(B, -1)
            return zs

        if xesn_out:
            # Build the echo-state feature vector [x_esn_input, x_esn, 1] for the controller
            inputs = transform(inputs).unsqueeze(0).to(device).double()
            action = torch.from_numpy(action).unsqueeze(0).double().to(device)
            x_esn_input, x_esn = self.model(inputs, action)
            B = inputs.shape[0]
            S = torch.cat((x_esn_input, x_esn, torch.ones((B, 1)).double().to(device)), dim=1)
            return S
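The xesn_out branch returns a bias-augmented echo-state feature vector S, which suggests the action comes from a linear readout on top of it. Below is a minimal sketch of such a readout, assuming an output weight matrix W_out and tanh squashing; the class, those names, and how W_out is fitted (e.g. by CMA-ES or ridge regression) are assumptions, not something this example defines.

import torch

class LinearReadout(torch.nn.Module):
    # Hypothetical controller: action = tanh(S @ W_out^T), with S from forward(..., xesn_out=True)
    def __init__(self, feature_dim, action_dim):
        super().__init__()
        self.W_out = torch.nn.Parameter(torch.zeros(action_dim, feature_dim).double())

    def forward(self, S):
        # S: (B, feature_dim), already including the constant bias column appended above
        return torch.tanh(S @ self.W_out.t())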
Example #3
    def expand(self, curNode: TreeNode):
        # A terminal node cannot be expanded
        if curNode.game.getReward()[1]:
            return -1
        # Ask the policy/value ("intuition") network for move priors and a value estimate
        policyInput = transform(curNode.game)
        policyInput = np.expand_dims(policyInput, axis=0)
        (intuitionProbs, intuitionValue, _) = self.intuitionPolicy(policyInput)
        intuitionProbs = intuitionProbs.numpy()
        intuitionProbs = np.squeeze(intuitionProbs)
        # Zero out illegal moves and renormalize the priors
        validMoves = curNode.game.getValidMoves()
        intuitionProbs = self.sanitizeActionProbs(intuitionProbs, validMoves)
        curNode.intuitionProbs = intuitionProbs

        # Create an edge for every legal move, seeded with its prior probability
        for move in validMoves:
            nextState = curNode.game.getNextState(move)
            nextNode = TreeNode(nextState)
            curNode.edges[move] = TreeEdge(
                nextNode, move[0], move[1],
                intuitionProbs[self._flattenMove(move)], 0.)

        return intuitionValue.numpy()
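expand stores the network prior on each TreeEdge, which is how AlphaZero-style search uses it during the selection phase. A minimal PUCT sketch is given below; the edge attributes Q, P, and N and the constant c_puct are assumptions about the surrounding TreeEdge class, not fields shown in this example.

import math

def puct_score(edge, parent_visits, c_puct=1.5):
    # edge.Q: mean value, edge.P: stored prior, edge.N: visit count (names assumed)
    return edge.Q + c_puct * edge.P * math.sqrt(parent_visits) / (1 + edge.N)

def select_move(node):
    # Pick the move whose edge maximizes the PUCT score
    parent_visits = sum(e.N for e in node.edges.values())
    return max(node.edges, key=lambda m: puct_score(node.edges[m], parent_visits))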
Example #4
if __name__ == "__main__":
    agent = Agent()
    agent.load_param()
    env = gym.make("CarRacing-v0")
    env = Env(env)

    running_score = []

    for i_ep in range(100):
        score = 0
        state = env.reset()
        # Zero the two recurrent hidden-state tensors at the start of each episode
        hidden = [torch.zeros(1, RSIZE).to(device).double() for _ in range(2)]
        for t in range(1000):
            state = transform(state).unsqueeze(0).to(device).double()
            action, hidden = agent.select_action(state, hidden)
            # Rescale the action from [0, 1] to the environment's range (steering in [-1, 1])
            state, reward, done, die = env.step(action *
                                                np.array([2., 1., 1.]) +
                                                np.array([-1., 0., 0.]))
            if args.render:
                env.render()
            score += reward
            if done:
                break

        running_score.append(score)

        logger.info('Ep {}\tScore: {:.2f}\t'.format(i_ep, score))
    logger.info('Avg Score: {:.2f} +/- {:.2f}'.format(np.mean(running_score),
                                                      np.std(running_score)))
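The element-wise transform passed to env.step maps a policy output in [0, 1]^3 onto CarRacing-v0's native action space: steering in [-1, 1], gas and brake in [0, 1]. A quick check of that mapping with hypothetical sample values:

import numpy as np

raw_action = np.array([0.5, 0.2, 0.0])   # hypothetical output in [0, 1], e.g. a Beta mean as in Example #1
env_action = raw_action * np.array([2., 1., 1.]) + np.array([-1., 0., 0.])
print(env_action)                        # [0.  0.2 0. ] -> straight steering, light gas, no brake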