def select_action(self, state):
    # Preprocess the raw observation and add a batch dimension.
    state = transform(state).unsqueeze(0).to(device).float()
    with torch.no_grad():
        # The network returns the (alpha, beta) parameters of a Beta policy.
        alpha, beta = self.net(state)[0]
    # At evaluation time, act with the mean of the Beta distribution.
    action = alpha / (alpha + beta)
    action = action.squeeze().cpu().numpy()
    return action
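# For contrast, a minimal sketch of how the same (alpha, beta) outputs could be
# used at training time: sample a stochastic action from the Beta distribution
# and keep its log-probability for the policy-gradient update. Only self.net,
# transform, and device are taken from the snippet above; the method name and
# the rest of this sketch are assumptions, not part of the original code.
def select_action_stochastic(self, state):
    state = transform(state).unsqueeze(0).to(device).float()
    with torch.no_grad():
        alpha, beta = self.net(state)[0]
    dist = torch.distributions.Beta(alpha, beta)
    action = dist.sample()
    a_logp = dist.log_prob(action).sum(dim=1)  # joint log-prob over action dims
    return action.squeeze().cpu().numpy(), a_logp.item()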
def forward(self, inputs, action=None, z_out=False, xesn_out=False, future_out=False):
    # Predict the next latent state directly.
    if future_out:
        return self.future(inputs)
    if z_out:
        # Encode a raw observation into the VAE latent vector z.
        inputs = transform(inputs).unsqueeze(0).to(device).double()
        B = inputs.shape[0]
        _, _, _, _, zs = self.model.vae(inputs)
        zs = zs.view(B, -1)
        return zs
    if xesn_out:
        # Build the ESN feature vector S = [input features, reservoir state, bias].
        inputs = transform(inputs).unsqueeze(0).to(device).double()
        action = torch.from_numpy(action).unsqueeze(0).double().to(device)
        x_esn_input, x_esn = self.model(inputs, action)
        B = inputs.shape[0]
        S = torch.cat((x_esn_input, x_esn, torch.ones((B, 1)).double().to(device)), dim=1)
        return S
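# A hypothetical usage sketch of the three forward() modes above. `model` is an
# instance of this class, `obs` a raw frame, and `act` a NumPy action vector; the
# `controller` readout matrix and the assumption that future_out consumes the
# latent code are illustrative and not part of the original code.
def rollout_step(model, obs, act, controller):
    zs = model.forward(obs, z_out=True)                 # VAE latent code of the frame
    S = model.forward(obs, action=act, xesn_out=True)   # ESN features with bias column
    next_act = (S @ controller).squeeze(0)              # linear readout over S (controller: double tensor)
    z_next = model.forward(zs, future_out=True)         # assumed: predict the next latent
    return next_act, z_next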
def expand(self, curNode: TreeNode):
    # Terminal nodes are not expanded.
    if curNode.game.getReward()[1]:
        return -1
    # Query the intuition (policy/value) network for move priors and a value estimate.
    policyInput = transform(curNode.game)
    policyInput = np.expand_dims(policyInput, axis=0)
    (intuitionProbs, intuitionValue, _) = self.intuitionPolicy(policyInput)
    intuitionProbs = intuitionProbs.numpy()
    intuitionProbs = np.squeeze(intuitionProbs)
    # Restrict the priors to the legal moves and renormalize.
    validMoves = curNode.game.getValidMoves()
    intuitionProbs = self.sanitizeActionProbs(intuitionProbs, validMoves)
    curNode.intuitionProbs = intuitionProbs
    # Create a child node and an edge (carrying its prior) for every legal move.
    for move in validMoves:
        nextState = curNode.game.getNextState(move)
        nextNode = TreeNode(nextState)
        curNode.edges[move] = TreeEdge(
            nextNode, move[0], move[1],
            intuitionProbs[self._flattenMove(move)], 0.)
    return intuitionValue.numpy()
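# sanitizeActionProbs is referenced above but not shown here. A plausible sketch
# (an assumption, not the original implementation) masks the priors to the legal
# moves and renormalizes, falling back to a uniform prior over the legal moves if
# everything was masked out.
def sanitizeActionProbs(self, actionProbs, validMoves):
    mask = np.zeros_like(actionProbs)
    for move in validMoves:
        mask[self._flattenMove(move)] = 1.0
    probs = actionProbs * mask
    total = probs.sum()
    if total > 0:
        return probs / total
    # All legal moves received (near-)zero probability: use a uniform prior.
    return mask / mask.sum()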
if __name__ == "__main__":
    agent = Agent()
    agent.load_param()
    env = gym.make("CarRacing-v0")
    env = Env(env)

    running_score = []
    for i_ep in range(100):
        score = 0
        state = env.reset()
        # Reset the recurrent hidden state (h, c) at the start of each episode.
        hidden = [torch.zeros(1, RSIZE).to(device).double() for _ in range(2)]
        for t in range(1000):
            state = transform(state).unsqueeze(0).to(device).double()
            action, hidden = agent.select_action(state, hidden)
            # Map the action from [0, 1] to the CarRacing ranges:
            # steering in [-1, 1], gas and brake in [0, 1].
            state, reward, done, die = env.step(action * np.array([2., 1., 1.]) + np.array([-1., 0., 0.]))
            if args.render:
                env.render()
            score += reward
            if done:
                break
        running_score.append(score)
        logger.info('Ep {}\tScore: {:.2f}\t'.format(i_ep, score))
    logger.info('Avg Score: {} + {}'.format(np.mean(running_score), np.std(running_score)))