Example #1
    def forward(self, state, action_mask, betsize_mask):
        mask = combined_masks(action_mask, betsize_mask)
        x = state
        if x.dim() == 2:
            x = x.unsqueeze(0)
        out = self.process_input(x)  # x is already (B, M, c) after the dim check above
        B, M, c = out.size()
        # Zero-pad the sequence dimension up to maxlen for a fixed-length LSTM input.
        n_padding = max(self.maxlen - M, 0)
        padding = torch.zeros(B, n_padding, out.size(-1))
        h = torch.cat((out, padding), dim=1)
        lstm_out, _ = self.lstm(h)
        # view(-1) flattens batch and time together, so this head assumes B == 1.
        t_logits = self.fc3(lstm_out.view(-1))
        category_logits = self.noise(t_logits)

        action_soft = F.softmax(category_logits, dim=-1)
        action_probs = norm_frequencies(action_soft, mask)
        m = Categorical(action_probs)
        action = m.sample()

        action_category, betsize_category = self.helper_functions.unwrap_action(
            action, state[:, -1, self.mapping['state']['previous_action']])
        outputs = {
            'action': action,
            'action_category': action_category,
            'action_prob': m.log_prob(action),
            'action_probs': action_probs,
            'betsize': betsize_category
        }
        return outputs
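
All of the examples here lean on two helpers, combined_masks and norm_frequencies, whose definitions are not shown. A minimal sketch of plausible implementations, assuming binary float masks and a joint action space built by concatenating the discrete-action and betsize dimensions (the concatenation is an assumption, not confirmed by the source):

import torch

def combined_masks(action_mask, betsize_mask):
    # Assumption: the joint action space is the discrete actions followed
    # by the betsize buckets, so the two masks are simply concatenated.
    return torch.cat([action_mask, betsize_mask], dim=-1)

def norm_frequencies(action_soft, mask):
    # Zero out illegal entries, then renormalize each row to sum to 1.
    masked = action_soft * mask
    return masked / masked.sum(-1, keepdim=True)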
Example #2
    def forward(self,state,mask):
        x = state
        if not isinstance(state,torch.Tensor):
            x = torch.tensor(x,dtype=torch.float32)
            x = x.unsqueeze(0)
        hand = x[:,self.mapping['state']['rank']].long()
        last_action = x[:,self.mapping['state']['previous_action']].long()
        hand = self.hand_emb(hand)
        last_action = self.action_emb(last_action)
        x = torch.cat([hand,last_action],dim=-1)
        x = self.activation(self.fc1(x))
        x = self.activation(self.fc2(x))
        x = self.fc3(x)
        action_logits = self.noise(x)
        action_soft = F.softmax(action_logits,dim=-1)
        action_probs = norm_frequencies(action_soft,mask)
        m = Categorical(action_probs)
        action = m.sample()

        outputs = {
            'action':action,
            'action_prob':m.log_prob(action),
            'action_probs':action_probs}
        return outputs
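
Every head in these examples pushes its logits through self.noise before the softmax. The layer is not defined in the source; a common pattern it plausibly follows is additive Gaussian noise on the logits during training, which keeps an early policy from collapsing onto one action. A hypothetical sketch:

import torch
import torch.nn as nn

class GaussianNoise(nn.Module):
    # Hypothetical stand-in for self.noise: adds gaussian noise to the
    # logits in training mode, passes them through unchanged in eval mode.
    def __init__(self, std=0.1):
        super().__init__()
        self.std = std

    def forward(self, x):
        if self.training and self.std > 0:
            return x + torch.randn_like(x) * self.std
        return x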
Example #3
    def forward(self,state,mask,betsize_mask):
        x = state
        M,c = x.size()
        hand = x[:,self.mapping['state']['rank']].long()
        last_action = x[:,self.mapping['state']['previous_action']].long()
        hand = self.hand_emb(hand)
        embedded_action = self.action_emb(last_action)
        x = torch.cat([hand,embedded_action],dim=-1)
        x = self.activation(self.fc1(x))
        x = self.activation(self.fc2(x))
        category_logits = self.fc3(x)
        category_logits = self.noise(category_logits)
        action_soft = F.softmax(category_logits,dim=-1)
        action_probs = norm_frequencies(action_soft,mask)
        m = Categorical(action_probs)
        action = m.sample()
        # Generate a betsize from a second head, independently of the
        # sampled action category.
        b = self.activation(self.bfc1(x))
        b = self.activation(self.bfc2(b))
        b = self.bfc3(b)
        betsize_logits = self.noise(b)
        betsize_probs = F.softmax(betsize_logits,dim=-1)
        # Fall back to a uniform mask when no betsize is legal so the
        # Categorical below never receives an all-zero row.
        if betsize_mask.sum(-1) == 0:
            betsize_mask = torch.ones(M,self.nA)
        mask_betsize_probs = betsize_probs * betsize_mask
        norm_betsize_probs = mask_betsize_probs / mask_betsize_probs.sum(-1).unsqueeze(1)
        b = Categorical(norm_betsize_probs)
        betsize = b.sample()
        betsize_prob = b.log_prob(betsize)

        outputs = {
            'action':action,
            'action_prob':m.log_prob(action),
            'action_probs':action_probs,
            'action_category':action,
            'betsize':betsize,
            'betsize_prob':betsize_prob,
            'betsize_probs':betsize_probs}
        return outputs
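
Example #3's betsize branch repeats a pattern worth isolating: mask the softmax output, guard against a mask with no legal entries, renormalize, and sample. A reusable sketch of that pattern (the all-ones fallback mirrors the betsize_mask.sum(-1) == 0 guard above, simplified to a whole-batch fallback):

import torch
from torch.distributions import Categorical

def sample_masked(probs, mask):
    # If any row has no legal entry, fall back to a uniform mask so
    # Categorical never sees an all-zero distribution.
    if mask.sum(-1).eq(0).any():
        mask = torch.ones_like(mask)
    masked = probs * mask
    masked = masked / masked.sum(-1, keepdim=True)
    dist = Categorical(masked)
    sample = dist.sample()
    return sample, dist.log_prob(sample)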
Example #4
 def forward(self, state, action_mask, betsize_mask):
     """
     state: B,M,39
     """
     x = state
     if not isinstance(x, torch.Tensor):
         x = torch.tensor(x, dtype=torch.float32).to(self.device)
         action_mask = torch.tensor(action_mask,
                                    dtype=torch.float).to(self.device)
         betsize_mask = torch.tensor(betsize_mask,
                                     dtype=torch.float).to(self.device)
     mask = combined_masks(action_mask, betsize_mask)
     out = self.process_input(x)
     B, M, c = out.size()
     # Truncate to the most recent maxlen steps, or left-pad with zeros so
     # recent actions sit at the end of the sequence.
     n_padding = self.maxlen - M
     if n_padding < 0:
         h = out[:, -self.maxlen:, :]
     else:
         padding = torch.zeros(B, n_padding, out.size(-1)).to(self.device)
         h = torch.cat((padding, out), dim=1)
     lstm_out, hidden_states = self.lstm(h)
     norm = self.batchnorm(lstm_out)
     t_logits = self.fc_final(norm.view(B, -1))
     category_logits = self.noise(t_logits)
     action_soft = F.softmax(category_logits, dim=-1)
     action_probs = norm_frequencies(action_soft, mask)
     previous_action = torch.as_tensor(
         state[:, -1, self.state_mapping['last_action']]).to(self.device)
     m = Categorical(action_probs)
     action = m.sample()
     action_category, betsize_category = self.helper_functions.batch_unwrap_action(
         action, previous_action)
     if B > 1:
         # batch training
         outputs = {
             'action': action,
             'action_category': action_category,
             'action_prob': m.log_prob(action),
             'action_probs': action_probs,
             'betsize': betsize_category
         }
     else:
         # playing hand
         outputs = {
             'action': action.item(),
             'action_category': action_category.item(),
             'action_prob': m.log_prob(action),
             'action_probs': action_probs,
             'betsize': betsize_category.item()
         }
     return outputs
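
Examples #4, #6, #8, and #10 all force the history to a fixed length before the LSTM: truncate to the most recent maxlen steps when it is too long, otherwise zero-pad. Pulled out as a helper (note that #4 and #10 pad on the left, keeping recent steps at the end, while #1 and #6 pad on the right):

import torch

def pad_or_truncate(out, maxlen, left_pad=True):
    # out: (B, M, c). Keep the most recent maxlen steps, or zero-pad to maxlen.
    B, M, c = out.size()
    if M >= maxlen:
        return out[:, -maxlen:, :]
    padding = torch.zeros(B, maxlen - M, c, device=out.device, dtype=out.dtype)
    return torch.cat((padding, out) if left_pad else (out, padding), dim=1)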
Example #5
 def forward(self,state,action_mask,betsize_mask):
     mask = combined_masks(action_mask,betsize_mask)
     if mask.dim() > 1:
         mask = mask[-1]
     x = state
     if x.dim() == 2:
         x = x.unsqueeze(0)
     out = self.preprocess(x)
     M,C = out.size()
     # Assumes M <= max_length; a longer history would make n_padding
     # negative and the zeros() call fail.
     n_padding = self.max_length - M
     padding = torch.zeros(n_padding,out.size(-1))
     h = torch.cat((out,padding),dim=0).unsqueeze(0)
     x,_ = self.lstm(h)
     # view(-1) flattens the whole padded sequence, so this head assumes batch size 1.
     t_logits = self.fc3(x.view(-1))
     category_logits = self.noise(t_logits)
     action_soft = F.softmax(category_logits,dim=-1)
     action_probs = norm_frequencies(action_soft,mask)
     last_action = state[M-1,self.mapping['state']['previous_action']].long().unsqueeze(-1)
     m = Categorical(action_probs)
     action = m.sample()
     action_category,betsize_category = self.helper_functions.unwrap_action(action,last_action)
     
     outputs = {
         'action':action,
         'action_category':action_category,
         'action_prob':m.log_prob(action),
         'action_probs':m.probs,
         'betsize':betsize_category
         }
     return outputs
Example #6
    def forward(self, state, action_mask, betsize_mask):
        x = torch.tensor(state, dtype=torch.float32).to(self.device)
        action_mask = torch.tensor(action_mask,
                                   dtype=torch.float).to(self.device)
        betsize_mask = torch.tensor(betsize_mask,
                                    dtype=torch.float).to(self.device)
        mask = combined_masks(action_mask, betsize_mask)
        out = self.process_input(x)
        # Actor
        B, M, c = out.size()
        n_padding = self.maxlen - M
        if n_padding < 0:
            h = out[:, -self.maxlen:, :]
        else:
            padding = torch.zeros(B, n_padding, out.size(-1)).to(self.device)
            h = torch.cat((out, padding), dim=1)
        lstm_out, _ = self.lstm(h)
        t_logits = self.policy_out(lstm_out.view(-1))
        category_logits = self.noise(t_logits)

        action_soft = F.softmax(category_logits, dim=-1)
        action_probs = norm_frequencies(action_soft, mask)
        m = Categorical(action_probs)
        action = m.sample()

        action_category, betsize_category = self.helper_functions.unwrap_action(
            action, state[:, -1, self.mapping['last_action']])
        outputs = {
            'action': action.item(),
            'action_category': action_category.item(),
            'action_prob': m.log_prob(action),
            'action_probs': action_probs,
            'betsize': betsize_category.item()
        }
        # Critic
        q_input = self.transformer(out)
        a = self.advantage_output(q_input)
        v = self.value_output(q_input)
        v = v.expand_as(a)
        q = v + a - a.mean(-1, keepdim=True).expand_as(a)
        outputs['value'] = q.squeeze(0)
        return outputs
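
The critic tail of Example #6 is the standard dueling-network aggregation, Q(s,a) = V(s) + A(s,a) - mean_a A(s,a), where subtracting the mean advantage makes the value/advantage split identifiable. Isolated as a function:

import torch

def dueling_q(value, advantage):
    # value: (B, 1), advantage: (B, nA). Broadcasting the state value across
    # actions and centering the advantages recovers the Q-values.
    return value.expand_as(advantage) + advantage \
        - advantage.mean(-1, keepdim=True).expand_as(advantage)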
Example #7
    def forward(self,state,action_mask,betsize_mask):
        mask = combined_masks(action_mask,betsize_mask)
        if mask.dim() > 1:
            mask = mask[-1]
        x = state
        M,C = x.size()
        out = self.preprocess(x)
        x = self.activation(self.fc1(out))
        x = self.activation(self.fc2(x))
        # Assumes M <= max_length; a longer history would make n_padding negative.
        n_padding = self.max_length - M
        padding = torch.zeros(n_padding,out.size(-1))
        h = torch.cat((out,padding),dim=0)
        pos_emd = self.positional_emb(torch.arange(self.max_length))
        h = h + pos_emd
        t_logits = self.fc3(h.view(-1)).unsqueeze(0)
        category_logits = self.noise(t_logits)
        action_soft = F.softmax(category_logits,dim=-1)
        action_probs = norm_frequencies(action_soft,mask)
        last_action = state[-1,self.mapping['state']['previous_action']].long().unsqueeze(-1)
        m = Categorical(action_probs)
        action = m.sample()
        action_category,betsize_category = self.helper_functions.unwrap_action(action,last_action)
        
        q_input = x.view(M,-1)
        a = self.advantage_output(q_input)
        v = self.value_output(q_input)
        v = v.expand_as(a)
        q = v + a - a.mean(1,keepdim=True).expand_as(a)

        outputs = {
            'action':action,
            'action_category':action_category,
            'action_prob':m.log_prob(action),
            'action_probs':m.probs,
            'betsize':betsize_category,
            'value':q
            }
        return outputs
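
Example #7 adds a learned positional embedding to the padded history before flattening, so the fully connected head can tell time steps apart. A plausible shape for self.positional_emb, assuming one learned vector per step with the same width as the preprocessed features (sizes are placeholders, not taken from the source):

import torch
import torch.nn as nn

max_length, emb = 10, 64          # hypothetical sizes
positional_emb = nn.Embedding(max_length, emb)

h = torch.zeros(max_length, emb)  # stand-in for the padded, preprocessed history
h = h + positional_emb(torch.arange(max_length))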
Example #8
    def forward(self, state, action_mask, betsize_mask):
        x = torch.tensor(state, dtype=torch.float32).to(self.device)
        action_mask = torch.tensor(action_mask,
                                   dtype=torch.float).to(self.device)
        betsize_mask = torch.tensor(betsize_mask,
                                    dtype=torch.float).to(self.device)
        mask = combined_masks(action_mask, betsize_mask)

        out = self.process_input(x)
        B, M, c = out.size()
        n_padding = self.maxlen - M
        if n_padding < 0:
            h = out[:, -self.maxlen:, :]
        else:
            padding = torch.zeros(B, n_padding, out.size(-1)).to(self.device)
            h = torch.cat((out, padding), dim=1)
        lstm_out, _ = self.lstm(h)
        norm = self.batchnorm(lstm_out)
        # view(-1) flattens batch and time together, so this head assumes batch size 1.
        t_logits = self.fc_final(norm.view(-1))
        category_logits = self.noise(t_logits)

        action_soft = F.softmax(category_logits, dim=-1)
        action_probs = norm_frequencies(action_soft, mask)
        m = Categorical(action_probs)
        action = m.sample()

        action_category, betsize_category = self.helper_functions.unwrap_action(
            action, state[:, -1, self.state_mapping['last_action']])
        outputs = {
            'action': action.item(),
            'action_category': action_category.item(),
            'action_prob': m.log_prob(action),
            'action_probs': action_probs,
            'betsize': betsize_category.item()
        }
        return outputs
Example #9
    def forward(self,state,action_mask,betsize_mask):
        mask = combined_masks(action_mask,betsize_mask)
        x = state
        hand = x[:,self.mapping['state']['rank']].long()
        last_action = x[:,self.mapping['state']['previous_action']].long()
        previous_betsize = x[:,self.mapping['state']['previous_betsize']].float()
        if previous_betsize.dim() == 1:
            previous_betsize = previous_betsize.unsqueeze(1)
        hand = self.hand_emb(hand)
        last_action_emb = self.action_emb(last_action)
        x = torch.cat([hand,last_action_emb,previous_betsize],dim=-1)
        x = self.activation(self.fc1(x))
        x = self.activation(self.fc2(x))
        category_logits = self.fc3(x)
        category_logits = self.noise(category_logits)
        action_soft = F.softmax(category_logits,dim=-1)
        action_probs = norm_frequencies(action_soft,mask)
        m = Categorical(action_probs)
        action = m.sample()

        action_category,betsize_category = self.helper_functions.unwrap_action(action,last_action)
        
        outputs = {
            'action':action,
            'action_category':action_category,
            'action_prob':m.log_prob(action),
            'action_probs':action_probs,
            'betsize':betsize_category
            }
        return outputs
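
Example #9's encoder concatenates a hand embedding, an action embedding, and the raw previous bet size. A hypothetical constructor consistent with that forward pass (all layer sizes are placeholders, not taken from the source):

import torch.nn as nn

class BaselinePolicy(nn.Module):
    # Hypothetical shapes consistent with Example #9's forward pass.
    def __init__(self, n_ranks=13, n_actions=6, emb=32, hidden=64):
        super().__init__()
        self.hand_emb = nn.Embedding(n_ranks, emb)
        self.action_emb = nn.Embedding(n_actions, emb)
        self.fc1 = nn.Linear(2 * emb + 1, hidden)  # +1 for previous_betsize
        self.fc2 = nn.Linear(hidden, hidden)
        self.fc3 = nn.Linear(hidden, n_actions)
        self.activation = nn.ReLU()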
Example #10
 def forward(self, state, action_mask, betsize_mask, target=False):
     """
     state: B,M,39
     """
     if not isinstance(state, torch.Tensor):
         state = torch.tensor(state, dtype=torch.float32).to(self.device)
         action_mask = torch.tensor(action_mask,
                                    dtype=torch.float32).to(self.device)
         betsize_mask = torch.tensor(betsize_mask,
                                     dtype=torch.float32).to(self.device)
     mask = combined_masks(action_mask, betsize_mask)
     if target and np.random.random() < self.epsilon:
         B = state.size(0)
         # pick random legal move
         action_masked = self.epsilon_weights * mask
         action_probs = action_masked / action_masked.sum(-1).unsqueeze(-1)
         action = action_probs.multinomial(num_samples=1, replacement=False)
         action_prob = torch.zeros(B, 1)  # placeholder log-prob for the exploratory branch
     else:
         out = self.process_input(state)
         B, M, c = state.size()
         n_padding = self.maxlen - M
         if n_padding < 0:
             h = out[:, -self.maxlen:, :]
         else:
             padding = torch.zeros(B, n_padding,
                                   out.size(-1)).to(self.device)
             h = torch.cat((padding, out), dim=1)
         lstm_out, hidden_states = self.lstm(h)
         norm = self.batchnorm(lstm_out)
         t_logits = self.fc_final(norm.view(B, -1))
         category_logits = self.noise(t_logits)
         action_soft = F.softmax(category_logits, dim=-1)
         action_probs = norm_frequencies(action_soft, mask)
         m = Categorical(action_probs)
         action = m.sample()
         action_prob = m.log_prob(action)
     previous_action = torch.as_tensor(
         state[:, -1, self.state_mapping['last_action']]).to(self.device)
     action_category, betsize_category = self.helper_functions.batch_unwrap_action(
         action, previous_action)
     if B > 1:
         # batch training
         outputs = {
             'action': action,
             'action_category': action_category,
             'action_prob': action_prob,
             'action_probs': action_probs,
             'betsize': betsize_category
         }
     else:
         # playing hand
         outputs = {
             'action': action.item(),
             'action_category': action_category.item(),
             'action_prob': action_prob,
             'action_probs': action_probs,
             'betsize': betsize_category.item()
         }
     return outputs
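
The target branch in Example #10 is epsilon-greedy exploration restricted to legal moves: with probability epsilon it samples from the masked epsilon_weights instead of the policy, and records a zero log-probability as a placeholder. The exploration step on its own:

import torch

def explore(mask, epsilon_weights):
    # Sample a random legal action; epsilon_weights allows non-uniform
    # exploration (uniform when it is all ones).
    weighted = epsilon_weights * mask
    probs = weighted / weighted.sum(-1, keepdim=True)
    return probs.multinomial(num_samples=1)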