def __init__(self,seed,nS,nA,nB,params,hidden_dims=(64,64),activation=F.leaky_relu):
    """
    Network capable of processing any number of prior actions.

    Num Categories: nA (check,fold,call,bet,raise)
    Num Betsizes: nB (various betsizes)

    Args:
        seed: RNG seed (currently unused; the manual_seed call is commented out).
        nS: state-space size.
        nA: number of action categories.
        nB: number of betsizes.
        params: settings dict; must contain 'mapping'.
        hidden_dims: widths of the two hidden FC layers.
        activation: activation function stored for the forward pass.
    """
    super().__init__()
    self.activation = activation
    # self.seed = torch.manual_seed(seed)
    self.nS = nS
    self.nA = nA
    self.nB = nB
    self.hand_emb = Embedder(5,64)
    # NOTE(review): sibling networks construct Embedder(6,64) here; 63 may be a typo — confirm.
    self.action_emb = Embedder(6,63)
    # check/fold carry no betsize, hence -2 when flattening (action,betsize) pairs
    self.combined_output = nA - 2 + nB
    self.helper_functions = NetworkFunctions(self.nA,self.nB)
    self.preprocess = PreProcessHistory(params)
    self.max_length = 10   # number of history steps after padding
    self.emb = 512         # LSTM input width expected from preprocessing
    n_heads = 8            # unused while the transformer is commented out
    depth = 2              # unused while the transformer is commented out
    self.positional_emb = Embedder(self.max_length,128)
    self.lstm = nn.LSTM(self.emb, 256)
    # self.transformer = CTransformer(self.emb,n_heads,depth,self.max_length,self.combined_output,max_pool=False)
    self.mapping = params['mapping']
    self.noise = GaussianNoise(is_relative_detach=True)
    self.fc1 = nn.Linear(128,hidden_dims[0])
    self.fc2 = nn.Linear(hidden_dims[0],hidden_dims[1])
    # 2560 presumably = max_length * LSTM hidden (10 * 256), i.e. the flattened LSTM output — TODO confirm
    self.fc3 = nn.Linear(2560,self.combined_output)
def __init__(self, seed, nO, nA, nB, params, hidden_dims=(64, 64), activation=F.leaky_relu):
    """
    Combined policy/value constructor: an LSTM feeds a flat policy head and a
    transformer feeds a dueling (value + advantage) critic head.

    Args:
        seed: RNG seed (unused here).
        nO: observation-space size.
        nA: number of action categories.
        nB: number of betsizes.
        params: settings dict; must contain 'maxlen', 'state_mapping',
            'device', 'transformer_in' and 'transformer_out'.
        hidden_dims: unused in this constructor.
        activation: activation function stored for the forward pass.
    """
    super().__init__()
    self.activation = activation
    self.nO = nO
    self.nA = nA
    self.nB = nB
    # check/fold carry no betsize, hence -2 when flattening (action,betsize) pairs
    self.combined_output = nA - 2 + nB
    self.maxlen = params['maxlen']
    self.mapping = params['state_mapping']
    self.device = params['device']
    # self.emb = params['embedding_size']
    self.helper_functions = NetworkFunctions(self.nA, self.nB)
    self.process_input = PreProcessLayer(params)
    # 1280 presumably matches the per-step feature width from PreProcessLayer — TODO confirm
    self.lstm = nn.LSTM(1280, 128)
    # policy head consumes the flattened LSTM output (maxlen * 128 = 1280 when maxlen == 10)
    self.policy_out = nn.Linear(1280, self.combined_output)
    self.noise = GaussianNoise(self.device)
    emb = params['transformer_in']
    n_heads = 8
    depth = 2
    self.transformer = CTransformer(emb, n_heads, depth, self.maxlen, params['transformer_out'])
    self.dropout = nn.Dropout(0.5)
    # dueling critic heads over the transformer embedding
    self.value_output = nn.Linear(params['transformer_out'], 1)
    self.advantage_output = nn.Linear(params['transformer_out'], self.combined_output)
def __init__(self, seed, nS, nA, nB, params, hidden_dims=(64, 64), activation=F.leaky_relu):
    """
    LSTM-based baseline actor over a preprocessed state history.

    Args:
        seed: RNG seed (unused; the manual_seed call is commented out).
        nS: state-space size.
        nA: number of action categories.
        nB: number of betsizes.
        params: settings dict; must contain 'maxlen' and 'mapping'.
        hidden_dims: widths for fc1/fc2.
        activation: activation function stored for the forward pass.
    """
    super().__init__()
    self.activation = activation
    self.nS = nS
    self.nA = nA
    self.nB = nB
    # check/fold carry no betsize, hence -2 when flattening (action,betsize) pairs
    self.combined_output = nA - 2 + nB
    self.helper_functions = NetworkFunctions(self.nA, self.nB)
    self.maxlen = params['maxlen']
    self.process_input = PreProcessLayer(params)
    # self.seed = torch.manual_seed(seed)
    self.mapping = params['mapping']
    self.hand_emb = Embedder(5, 64)
    self.action_emb = Embedder(6, 64)
    self.betsize_emb = Embedder(self.nB, 64)
    self.noise = GaussianNoise()
    self.emb = 1248   # LSTM input width expected from preprocessing
    n_heads = 8       # unused while the transformer is commented out
    depth = 2         # unused while the transformer is commented out
    self.lstm = nn.LSTM(self.emb, 128)
    # self.transformer = CTransformer(emb,n_heads,depth,self.max_length,self.nA)
    self.fc1 = nn.Linear(528, hidden_dims[0])
    self.fc2 = nn.Linear(hidden_dims[0], hidden_dims[1])
    # 1280 presumably = maxlen * LSTM hidden (10 * 128), i.e. the flattened LSTM output — TODO confirm
    self.fc3 = nn.Linear(1280, self.combined_output)
    self.dropout = nn.Dropout(0.5)
class FlatBetsizeActor(nn.Module):
    """Flat (history-free) actor sampling one combined (action,betsize) category.

    Num Categories: nA (check,fold,call,bet,raise)
    Num Betsizes: nB (various betsizes)

    The current hand rank and previous action are embedded, concatenated with
    the previous betsize scalar, pushed through two FC layers, and the masked
    softmax over the combined category space is sampled.
    """

    def __init__(self,seed,nS,nA,nB,params,hidden_dims=(64,64),activation=F.leaky_relu):
        super().__init__()
        self.activation = activation
        self.nS = nS
        self.nA = nA
        self.nB = nB
        # check/fold carry no betsize, hence -2 in the flattened output size
        self.combined_output = nA - 2 + nB
        self.helper_functions = NetworkFunctions(self.nA,self.nB)
        self.mapping = params['mapping']
        self.hand_emb = Embedder(5,64)
        self.action_emb = Embedder(6,64)
        self.betsize_emb = Embedder(self.nB,64)
        self.noise = GaussianNoise()
        # 129 = hand emb (64) + action emb (64) + previous betsize scalar (1)
        self.fc1 = nn.Linear(129,hidden_dims[0])
        self.fc2 = nn.Linear(hidden_dims[0],hidden_dims[1])
        self.fc3 = nn.Linear(hidden_dims[1],self.combined_output)

    def forward(self,state,action_mask,betsize_mask):
        """Return sampled action, unwrapped category/betsize, log-prob and probs."""
        legal_mask = combined_masks(action_mask,betsize_mask)
        rank = state[:,self.mapping['state']['rank']].long()
        prior_action = state[:,self.mapping['state']['previous_action']].long()
        prior_bet = state[:,self.mapping['state']['previous_betsize']].float()
        if prior_bet.dim() == 1:
            prior_bet = prior_bet.unsqueeze(1)
        features = torch.cat(
            [self.hand_emb(rank),self.action_emb(prior_action),prior_bet],
            dim=-1)
        hidden = self.activation(self.fc1(features))
        hidden = self.activation(self.fc2(hidden))
        category_logits = self.noise(self.fc3(hidden))
        probs = norm_frequencies(F.softmax(category_logits,dim=-1),legal_mask)
        dist = Categorical(probs)
        choice = dist.sample()
        action_category,betsize_category = self.helper_functions.unwrap_action(choice,prior_action)
        return {
            'action':choice,
            'action_category':action_category,
            'action_prob':dist.log_prob(choice),
            'action_probs':probs,
            'betsize':betsize_category
        }
def __init__(self,seed,nS,nA,nB,params,hidden_dims=(64,64),activation=F.leaky_relu):
    """
    Flat actor constructor (no action-history processing).

    Num Categories: nA (check,fold,call,bet,raise)
    Num Betsizes: nB (various betsizes)

    Args:
        seed: RNG seed (unused here).
        nS: state-space size.
        nA: number of action categories.
        nB: number of betsizes.
        params: settings dict; must contain 'mapping'.
        hidden_dims: widths of the two hidden FC layers.
        activation: activation function stored for the forward pass.
    """
    super().__init__()
    self.activation = activation
    self.nS = nS
    self.nA = nA
    self.nB = nB
    # check/fold carry no betsize, hence -2 when flattening (action,betsize) pairs
    self.combined_output = nA - 2 + nB
    self.helper_functions = NetworkFunctions(self.nA,self.nB)
    self.mapping = params['mapping']
    self.hand_emb = Embedder(5,64)
    self.action_emb = Embedder(6,64)
    self.betsize_emb = Embedder(self.nB,64)
    self.noise = GaussianNoise()
    # 129 presumably = hand emb (64) + action emb (64) + previous betsize scalar (1) — TODO confirm
    self.fc1 = nn.Linear(129,hidden_dims[0])
    self.fc2 = nn.Linear(hidden_dims[0],hidden_dims[1])
    self.fc3 = nn.Linear(hidden_dims[1],self.combined_output)
def __init__(self, seed, nS, nA, nB, params, hidden_dims=(64, 64), activation=F.leaky_relu):
    """
    Epsilon-greedy Omaha actor constructor: bidirectional LSTM + batchnorm
    trunk into a single linear head over the combined category space.

    Args:
        seed: RNG seed (unused; the manual_seed call is commented out).
        nS: state-space size.
        nA: number of action categories.
        nB: number of betsizes.
        params: settings dict; must contain 'maxlen', 'device', 'epsilon',
            'epsilon_weights', 'state_mapping', 'lstm_in', 'lstm_out'.
        hidden_dims: unused in this constructor.
        activation: activation function stored for the forward pass.
    """
    super().__init__()
    self.activation = activation
    self.nS = nS
    self.nA = nA
    self.nB = nB
    # check/fold carry no betsize, hence -2 when flattening (action,betsize) pairs
    self.combined_output = nA - 2 + nB
    self.helper_functions = NetworkFunctions(self.nA, self.nB)
    self.maxlen = params['maxlen']
    self.device = params['device']
    self.epsilon = params['epsilon']
    # per-category weights used when sampling a random legal move while exploring
    self.epsilon_weights = params['epsilon_weights'].to(self.device)
    self.process_input = PreProcessLayer(params)
    # self.seed = torch.manual_seed(seed)
    self.state_mapping = params['state_mapping']
    self.action_emb = Embedder(Action.UNOPENED, 64)
    self.betsize_emb = Embedder(self.nB, 64)
    self.noise = GaussianNoise(self.device)
    self.emb = 1248   # appears unused in this network — TODO confirm
    n_heads = 8       # unused while the attention/transformer paths are commented out
    depth = 2         # unused
    # self.attention = EncoderAttention(params['lstm_in'],params['lstm_out'])
    self.lstm = nn.LSTM(params['lstm_in'], params['lstm_out'], bidirectional=True)
    self.batchnorm = nn.BatchNorm1d(self.maxlen)
    # self.blocks = nn.Sequential(
    # IdentityBlock(hidden_dims=(2560,2560,512),activation=F.leaky_relu),
    # IdentityBlock(hidden_dims=(512,512,256),activation=F.leaky_relu),
    # )
    # 5120 presumably = maxlen * 2*lstm_out (bidirectional) — TODO confirm against params
    self.fc_final = nn.Linear(5120, self.combined_output)
class CombinedNet(Network):
    """Combined actor-critic: LSTM policy head plus transformer dueling Q head."""

    def __init__(self, seed, nO, nA, nB, params, hidden_dims=(64, 64), activation=F.leaky_relu):
        """
        Args:
            seed: RNG seed (unused here).
            nO: observation-space size.
            nA: number of action categories.
            nB: number of betsizes.
            params: settings dict; must contain 'maxlen', 'state_mapping',
                'device', 'transformer_in' and 'transformer_out'.
            hidden_dims: unused in this constructor.
            activation: activation function stored for the forward pass.
        """
        super().__init__()
        self.activation = activation
        self.nO = nO
        self.nA = nA
        self.nB = nB
        # check/fold carry no betsize, hence -2 when flattening (action,betsize) pairs
        self.combined_output = nA - 2 + nB
        self.maxlen = params['maxlen']
        self.mapping = params['state_mapping']
        self.device = params['device']
        # self.emb = params['embedding_size']
        self.helper_functions = NetworkFunctions(self.nA, self.nB)
        self.process_input = PreProcessLayer(params)
        self.lstm = nn.LSTM(1280, 128)
        # policy head over the flattened LSTM output (maxlen * 128 = 1280 when maxlen == 10)
        self.policy_out = nn.Linear(1280, self.combined_output)
        self.noise = GaussianNoise(self.device)
        emb = params['transformer_in']
        n_heads = 8
        depth = 2
        self.transformer = CTransformer(emb, n_heads, depth, self.maxlen, params['transformer_out'])
        self.dropout = nn.Dropout(0.5)
        # dueling critic heads over the transformer embedding
        self.value_output = nn.Linear(params['transformer_out'], 1)
        self.advantage_output = nn.Linear(params['transformer_out'], self.combined_output)

    def forward(self, state, action_mask, betsize_mask):
        """Sample an action from the LSTM policy head and compute dueling Q values.

        Returns a dict with the sampled action (as Python scalars), log-prob,
        masked probability vector, betsize and the critic's Q vector.
        """
        x = torch.tensor(state, dtype=torch.float32).to(self.device)
        action_mask = torch.tensor(action_mask, dtype=torch.float).to(self.device)
        betsize_mask = torch.tensor(betsize_mask, dtype=torch.float).to(self.device)
        mask = combined_masks(action_mask, betsize_mask)
        out = self.process_input(x)
        # Actor
        B, M, c = out.size()
        n_padding = self.maxlen - M
        if n_padding < 0:
            # history longer than maxlen: keep only the most recent maxlen steps
            h = out[:, -self.maxlen:, :]
        else:
            padding = torch.zeros(B, n_padding, out.size(-1)).to(self.device)
            # NOTE(review): padding is appended after the sequence here, whereas
            # OmahaActor prepends it — confirm which is intended.
            h = torch.cat((out, padding), dim=1)
        lstm_out, _ = self.lstm(h)
        # NOTE(review): view(-1) flattens across the batch dimension; together
        # with the .item() calls below this forward appears to assume B == 1 — confirm.
        t_logits = self.policy_out(lstm_out.view(-1))
        category_logits = self.noise(t_logits)
        action_soft = F.softmax(category_logits, dim=-1)
        action_probs = norm_frequencies(action_soft, mask)
        m = Categorical(action_probs)
        action = m.sample()
        action_category, betsize_category = self.helper_functions.unwrap_action(
            action, state[:, -1, self.mapping['last_action']])
        outputs = {
            'action': action.item(),
            'action_category': action_category.item(),
            'action_prob': m.log_prob(action),
            'action_probs': action_probs,
            'betsize': betsize_category.item()
        }
        # Critic: dueling combination Q = V + (A - mean(A))
        q_input = self.transformer(out)
        a = self.advantage_output(q_input)
        v = self.value_output(q_input)
        v = v.expand_as(a)
        q = v + a - a.mean(-1, keepdim=True).expand_as(a)
        outputs['value'] = q.squeeze(0)
        return outputs
class HoldemBaseline(Network):
    """Baseline Holdem actor: preprocessed history -> LSTM -> linear policy head."""

    def __init__(self, seed, nS, nA, nB, params, hidden_dims=(64, 64), activation=F.leaky_relu):
        """
        Args:
            seed: RNG seed (unused; the manual_seed call is commented out).
            nS: state-space size.
            nA: number of action categories.
            nB: number of betsizes.
            params: settings dict; must contain 'maxlen' and 'mapping'.
            hidden_dims: widths for fc1/fc2.
            activation: activation function stored for the forward pass.
        """
        super().__init__()
        self.activation = activation
        self.nS = nS
        self.nA = nA
        self.nB = nB
        # check/fold carry no betsize, hence -2 when flattening (action,betsize) pairs
        self.combined_output = nA - 2 + nB
        self.helper_functions = NetworkFunctions(self.nA, self.nB)
        self.maxlen = params['maxlen']
        self.process_input = PreProcessLayer(params)
        # self.seed = torch.manual_seed(seed)
        self.mapping = params['mapping']
        self.hand_emb = Embedder(5, 64)
        self.action_emb = Embedder(6, 64)
        self.betsize_emb = Embedder(self.nB, 64)
        self.noise = GaussianNoise()
        self.emb = 1248   # LSTM input width expected from preprocessing
        n_heads = 8       # unused while the transformer is commented out
        depth = 2         # unused
        self.lstm = nn.LSTM(self.emb, 128)
        # self.transformer = CTransformer(emb,n_heads,depth,self.max_length,self.nA)
        self.fc1 = nn.Linear(528, hidden_dims[0])
        self.fc2 = nn.Linear(hidden_dims[0], hidden_dims[1])
        # 1280 presumably = maxlen * LSTM hidden (10 * 128) — TODO confirm
        self.fc3 = nn.Linear(1280, self.combined_output)
        self.dropout = nn.Dropout(0.5)

    def forward(self, state, action_mask, betsize_mask):
        """Sample a combined (action,betsize) category for one state history."""
        mask = combined_masks(action_mask, betsize_mask)
        x = state
        if x.dim() == 2:
            # add a batch dimension for unbatched input
            x = x.unsqueeze(0)
        # NOTE(review): the extra unsqueeze(0) assumes process_input returns an
        # unbatched (M,c) tensor — confirm against PreProcessLayer's output rank.
        out = self.process_input(x).unsqueeze(0)
        B, M, c = out.size()
        # clamped at zero: histories longer than maxlen are NOT truncated here,
        # which would overflow fc3's fixed 1280 input — TODO confirm intended
        n_padding = max(self.maxlen - M, 0)
        padding = torch.zeros(B, n_padding, out.size(-1))
        h = torch.cat((out, padding), dim=1)
        lstm_out, _ = self.lstm(h)
        # view(-1) flattens across the batch; appears to assume B == 1 — confirm
        t_logits = self.fc3(lstm_out.view(-1))
        category_logits = self.noise(t_logits)
        action_soft = F.softmax(category_logits, dim=-1)
        action_probs = norm_frequencies(action_soft, mask)
        m = Categorical(action_probs)
        action = m.sample()
        action_category, betsize_category = self.helper_functions.unwrap_action(
            action, state[:, -1, self.mapping['state']['previous_action']])
        outputs = {
            'action': action,
            'action_category': action_category,
            'action_prob': m.log_prob(action),
            'action_probs': action_probs,
            'betsize': betsize_category
        }
        return outputs
class OmahaActor(Network):
    """Omaha policy network with epsilon-greedy exploration.

    Trunk: PreProcessLayer -> bidirectional LSTM -> BatchNorm1d -> linear head
    over the combined (action,betsize) category space.
    """

    def __init__(self, seed, nS, nA, nB, params, hidden_dims=(64, 64), activation=F.leaky_relu):
        """
        Args:
            seed: RNG seed (unused; the manual_seed call is commented out).
            nS: state-space size.
            nA: number of action categories.
            nB: number of betsizes.
            params: settings dict; must contain 'maxlen', 'device', 'epsilon',
                'epsilon_weights', 'state_mapping', 'lstm_in', 'lstm_out'.
            hidden_dims: unused in this constructor.
            activation: activation function stored for the forward pass.
        """
        super().__init__()
        self.activation = activation
        self.nS = nS
        self.nA = nA
        self.nB = nB
        # check/fold carry no betsize, hence -2 when flattening (action,betsize) pairs
        self.combined_output = nA - 2 + nB
        self.helper_functions = NetworkFunctions(self.nA, self.nB)
        self.maxlen = params['maxlen']
        self.device = params['device']
        self.epsilon = params['epsilon']
        # per-category weights used when sampling a random legal move while exploring
        self.epsilon_weights = params['epsilon_weights'].to(self.device)
        self.process_input = PreProcessLayer(params)
        # self.seed = torch.manual_seed(seed)
        self.state_mapping = params['state_mapping']
        self.action_emb = Embedder(Action.UNOPENED, 64)
        self.betsize_emb = Embedder(self.nB, 64)
        self.noise = GaussianNoise(self.device)
        self.emb = 1248   # appears unused in this network — TODO confirm
        n_heads = 8       # unused while the attention/transformer paths are commented out
        depth = 2         # unused
        # self.attention = EncoderAttention(params['lstm_in'],params['lstm_out'])
        self.lstm = nn.LSTM(params['lstm_in'], params['lstm_out'], bidirectional=True)
        self.batchnorm = nn.BatchNorm1d(self.maxlen)
        # self.blocks = nn.Sequential(
        # IdentityBlock(hidden_dims=(2560,2560,512),activation=F.leaky_relu),
        # IdentityBlock(hidden_dims=(512,512,256),activation=F.leaky_relu),
        # )
        # 5120 presumably = maxlen * 2*lstm_out (bidirectional) — TODO confirm against params
        self.fc_final = nn.Linear(5120, self.combined_output)

    def set_device(self, device):
        # propagate device changes to the preprocessing layer as well
        self.device = device
        self.process_input.set_device(device)

    def forward(self, state, action_mask, betsize_mask, target=False):
        """
        state: B,M,39

        When ``target`` is True, with probability epsilon a random legal move is
        sampled (weighted by epsilon_weights) instead of running the trunk; the
        returned log-prob is a zero placeholder in that branch.
        """
        if not isinstance(state, torch.Tensor):
            state = torch.tensor(state, dtype=torch.float32).to(self.device)
            action_mask = torch.tensor(action_mask, dtype=torch.float32).to(self.device)
            betsize_mask = torch.tensor(betsize_mask, dtype=torch.float32).to(self.device)
        mask = combined_masks(action_mask, betsize_mask)
        if target and np.random.random() < self.epsilon:
            B = state.size(0)
            # pick random legal move
            action_masked = self.epsilon_weights * mask
            action_probs = action_masked / action_masked.sum(-1).unsqueeze(-1)
            action = action_probs.multinomial(num_samples=1, replacement=False)
            # exploration branch: no meaningful log-prob, return zeros
            action_prob = torch.zeros(B, 1)
        else:
            out = self.process_input(state)
            B, M, c = state.size()
            n_padding = self.maxlen - M
            if n_padding < 0:
                # history longer than maxlen: keep only the most recent maxlen steps
                h = out[:, -self.maxlen:, :]
            else:
                padding = torch.zeros(B, n_padding, out.size(-1)).to(self.device)
                # pad at the front so the most recent steps sit at the end
                h = torch.cat((padding, out), dim=1)
            lstm_out, hidden_states = self.lstm(h)
            norm = self.batchnorm(lstm_out)
            # self.attention(out)
            # blocks_out = self.blocks(lstm_out.view(-1))
            t_logits = self.fc_final(norm.view(B, -1))
            category_logits = self.noise(t_logits)
            # skip connection
            # category_logits += h
            action_soft = F.softmax(category_logits, dim=-1)
            action_probs = norm_frequencies(action_soft, mask)
            m = Categorical(action_probs)
            action = m.sample()
            action_prob = m.log_prob(action)
        previous_action = torch.as_tensor(
            state[:, -1, self.state_mapping['last_action']]).to(self.device)
        action_category, betsize_category = self.helper_functions.batch_unwrap_action(
            action, previous_action)
        if B > 1:
            # batch training: keep tensor outputs
            outputs = {
                'action': action,
                'action_category': action_category,
                'action_prob': action_prob,
                'action_probs': action_probs,
                'betsize': betsize_category
            }
        else:
            # playing hand: convert to Python scalars
            outputs = {
                'action': action.item(),
                'action_category': action_category.item(),
                'action_prob': action_prob,
                'action_probs': action_probs,
                'betsize': betsize_category.item()
            }
        return outputs
class OmahaBatchActor(Network):
    """Omaha actor variant: bidirectional LSTM + batchnorm trunk, linear head."""

    def __init__(self, seed, nS, nA, nB, params, hidden_dims=(64, 64), activation=F.leaky_relu):
        """
        Args:
            seed: RNG seed (unused; the manual_seed call is commented out).
            nS: state-space size.
            nA: number of action categories.
            nB: number of betsizes.
            params: settings dict; must contain 'maxlen', 'device', 'state_mapping'.
            hidden_dims: unused in this constructor.
            activation: activation function stored for the forward pass.
        """
        super().__init__()
        self.activation = activation
        self.nS = nS
        self.nA = nA
        self.nB = nB
        # check/fold carry no betsize, hence -2 when flattening (action,betsize) pairs
        self.combined_output = nA - 2 + nB
        self.helper_functions = NetworkFunctions(self.nA, self.nB)
        self.maxlen = params['maxlen']
        self.device = params['device']
        self.process_input = PreProcessLayer(params)
        # self.seed = torch.manual_seed(seed)
        self.state_mapping = params['state_mapping']
        self.hand_emb = Embedder(5, 64)
        self.action_emb = Embedder(Action.UNOPENED, 64)
        self.betsize_emb = Embedder(self.nB, 64)
        self.noise = GaussianNoise(self.device)
        self.emb = 1248   # appears unused in this network — TODO confirm
        n_heads = 8       # unused while the blocks path is commented out
        depth = 2         # unused
        self.lstm = nn.LSTM(1280, 128, bidirectional=True)
        self.batchnorm = nn.BatchNorm1d(self.maxlen)
        # self.blocks = nn.Sequential(
        # IdentityBlock(hidden_dims=(2560,2560,512),activation=F.leaky_relu),
        # IdentityBlock(hidden_dims=(512,512,256),activation=F.leaky_relu),
        # )
        # 2560 presumably = maxlen * 2*128 (bidirectional LSTM) when maxlen == 10 — TODO confirm
        self.fc_final = nn.Linear(2560, self.combined_output)
        self.dropout = nn.Dropout(0.5)

    def forward(self, state, action_mask, betsize_mask):
        """Sample one combined (action,betsize) category; returns Python scalars."""
        x = torch.tensor(state, dtype=torch.float32).to(self.device)
        action_mask = torch.tensor(action_mask, dtype=torch.float).to(self.device)
        betsize_mask = torch.tensor(betsize_mask, dtype=torch.float).to(self.device)
        mask = combined_masks(action_mask, betsize_mask)
        out = self.process_input(x)
        B, M, c = out.size()
        n_padding = self.maxlen - M
        if n_padding < 0:
            # history longer than maxlen: keep only the most recent maxlen steps
            h = out[:, -self.maxlen:, :]
        else:
            padding = torch.zeros(B, n_padding, out.size(-1)).to(self.device)
            # NOTE(review): padding appended after the sequence; OmahaActor prepends — confirm intended
            h = torch.cat((out, padding), dim=1)
        lstm_out, _ = self.lstm(h)
        norm = self.batchnorm(lstm_out)
        # blocks_out = self.blocks(lstm_out.view(-1))
        # NOTE(review): view(-1) flattens across the batch; together with the
        # .item() calls below this forward appears to assume B == 1 — confirm.
        t_logits = self.fc_final(norm.view(-1))
        category_logits = self.noise(t_logits)
        action_soft = F.softmax(category_logits, dim=-1)
        action_probs = norm_frequencies(action_soft, mask)
        m = Categorical(action_probs)
        action = m.sample()
        action_category, betsize_category = self.helper_functions.unwrap_action(
            action, state[:, -1, self.state_mapping['last_action']])
        outputs = {
            'action': action.item(),
            'action_category': action_category.item(),
            'action_prob': m.log_prob(action),
            'action_probs': action_probs,
            'betsize': betsize_category.item()
        }
        return outputs
class FlatHistoricalActor(nn.Module):
    """
    Actor that consumes a variable-length action history.

    The history is preprocessed, zero-padded to ``max_length`` steps, run
    through an LSTM, and its flattened output mapped to logits over the
    combined (action,betsize) category space.

    Num Categories: nA (check,fold,call,bet,raise)
    Num Betsizes: nB (various betsizes)
    """

    def __init__(self,seed,nS,nA,nB,params,hidden_dims=(64,64),activation=F.leaky_relu):
        super().__init__()
        self.activation = activation
        # self.seed = torch.manual_seed(seed)
        self.nS = nS
        self.nA = nA
        self.nB = nB
        self.hand_emb = Embedder(5,64)
        self.action_emb = Embedder(6,63)
        # check/fold carry no betsize, hence -2 in the flattened output size
        self.combined_output = nA - 2 + nB
        self.helper_functions = NetworkFunctions(self.nA,self.nB)
        self.preprocess = PreProcessHistory(params)
        self.max_length = 10
        self.emb = 512
        n_heads = 8   # retained for the (disabled) transformer variant
        depth = 2
        self.positional_emb = Embedder(self.max_length,128)
        self.lstm = nn.LSTM(self.emb, 256)
        self.mapping = params['mapping']
        self.noise = GaussianNoise(is_relative_detach=True)
        self.fc1 = nn.Linear(128,hidden_dims[0])
        self.fc2 = nn.Linear(hidden_dims[0],hidden_dims[1])
        self.fc3 = nn.Linear(2560,self.combined_output)

    def forward(self,state,action_mask,betsize_mask):
        """Sample one combined (action,betsize) category for the given history."""
        legal = combined_masks(action_mask,betsize_mask)
        if legal.dim() > 1:
            legal = legal[-1]
        history = state.unsqueeze(0) if state.dim() == 2 else state
        feats = self.preprocess(history)
        M,C = feats.size()
        filler = torch.zeros(self.max_length - M,feats.size(-1))
        padded = torch.cat((feats,filler),dim=0).unsqueeze(0)
        lstm_out,_ = self.lstm(padded)
        category_logits = self.noise(self.fc3(lstm_out.view(-1)))
        probs = norm_frequencies(F.softmax(category_logits,dim=-1),legal)
        prior_action = state[M-1,self.mapping['state']['previous_action']].long().unsqueeze(-1)
        dist = Categorical(probs)
        choice = dist.sample()
        action_category,betsize_category = self.helper_functions.unwrap_action(choice,prior_action)
        return {
            'action':choice,
            'action_category':action_category,
            'action_prob':dist.log_prob(choice),
            'action_probs':dist.probs,
            'betsize':betsize_category
        }
class FlatAC(nn.Module):
    """
    Flat actor-critic over a padded action history.

    Num Categories: nA (check,fold,call,bet,raise)
    Num Betsizes: nB (various betsizes)

    The actor path adds positional embeddings to the padded history and maps
    the flattened result through fc3; the critic path is a dueling head
    (value + advantage) over the fc1/fc2 features.
    """

    def __init__(self,seed,nS,nA,nB,params,hidden_dims=(256,128),activation=F.leaky_relu):
        """
        Args:
            seed: RNG seed; torch.manual_seed is invoked here.
            nS: state-space size.
            nA: number of action categories.
            nB: number of betsizes.
            params: settings dict; must contain 'mapping'.
            hidden_dims: widths of the two hidden FC layers.
            activation: activation between FC layers.
        """
        super().__init__()
        self.activation = activation
        self.nS = nS
        self.nA = nA
        self.nB = nB
        # check/fold carry no betsize, hence -2 when flattening (action,betsize) pairs
        self.combined_output = nA - 2 + nB
        self.helper_functions = NetworkFunctions(self.nA,self.nB)
        self.preprocess = PreProcessHistory(params)
        self.max_length = 10
        emb = 128
        n_heads = 8   # only used by the transformer, which forward bypasses
        depth = 2
        self.positional_emb = Embedder(self.max_length,128)
        self.transformer = CTransformer(emb,n_heads,depth,self.max_length,self.combined_output,max_pool=False)
        self.seed = torch.manual_seed(seed)
        self.mapping = params['mapping']
        self.noise = GaussianNoise(is_relative_detach=True)
        self.fc1 = nn.Linear(513,hidden_dims[0])
        self.fc2 = nn.Linear(hidden_dims[0],hidden_dims[1])
        # 1280 = max_length * 128 — the flattened padded history
        self.fc3 = nn.Linear(1280,self.combined_output)
        # NOTE(review): these heads expect 64 input features, but q_input rows in
        # forward carry hidden_dims[1] features — confirm the intended sizes.
        self.value_output = nn.Linear(64,1)
        self.advantage_output = nn.Linear(64,self.combined_output)

    def forward(self,state,action_mask,betsize_mask):
        """Sample a combined (action,betsize) category and compute dueling Q values.

        Args:
            state: (M,C) history of processed steps.
            action_mask/betsize_mask: legality masks; combined, then reduced to
                the final row when batched.

        Returns:
            dict with sampled action, unwrapped category/betsize, log-prob,
            probability vector and dueling Q values.
        """
        mask = combined_masks(action_mask,betsize_mask)
        if mask.dim() > 1:
            mask = mask[-1]
        x = state
        M,C = x.size()
        out = self.preprocess(x)
        x = self.activation(self.fc1(out))
        x = self.activation(self.fc2(x))
        # FIX: histories longer than max_length previously produced a negative
        # padding size and crashed in torch.zeros; truncate to the most recent
        # max_length steps instead, consistent with the sibling actors.
        n_padding = self.max_length - M
        if n_padding < 0:
            h = out[-self.max_length:]
        else:
            padding = torch.zeros(n_padding,out.size(-1))
            h = torch.cat((out,padding),dim=0)
        pos_emd = self.positional_emb(torch.arange(self.max_length))
        h = h + pos_emd
        t_logits = self.fc3(h.view(-1)).unsqueeze(0)
        category_logits = self.noise(t_logits)
        action_soft = F.softmax(category_logits,dim=-1)
        action_probs = norm_frequencies(action_soft,mask)
        last_action = state[-1,self.mapping['state']['previous_action']].long().unsqueeze(-1)
        m = Categorical(action_probs)
        action = m.sample()
        action_category,betsize_category = self.helper_functions.unwrap_action(action,last_action)
        # Dueling critic: Q = V + (A - mean(A)); broadcasting of the keepdim
        # mean makes the original's extra expand_as redundant.
        q_input = x.view(M,-1)
        a = self.advantage_output(q_input)
        v = self.value_output(q_input)
        v = v.expand_as(a)
        q = v + a - a.mean(1,keepdim=True)
        outputs = {
            'action':action,
            'action_category':action_category,
            'action_prob':m.log_prob(action),
            'action_probs':m.probs,
            'betsize':betsize_category,
            'value':q
        }
        return outputs
class OmahaActor(Network):
    """Omaha policy network (no exploration branch).

    Trunk: PreProcessLayer -> bidirectional LSTM -> BatchNorm1d -> linear head
    over the combined (action,betsize) category space.

    NOTE(review): this duplicates the class name of the epsilon-greedy
    OmahaActor defined earlier — if both live in one module the later
    definition shadows the earlier; confirm which is intended.
    """

    def __init__(self, seed, nS, nA, nB, params, hidden_dims=(64, 64), activation=F.leaky_relu):
        """
        Args:
            seed: RNG seed (unused; the manual_seed call is commented out).
            nS: state-space size.
            nA: number of action categories.
            nB: number of betsizes.
            params: settings dict; must contain 'maxlen', 'device',
                'state_mapping', 'lstm_in', 'lstm_out'.
            hidden_dims: unused in this constructor.
            activation: activation function stored for the forward pass.
        """
        super().__init__()
        self.activation = activation
        self.nS = nS
        self.nA = nA
        self.nB = nB
        # check/fold carry no betsize, hence -2 when flattening (action,betsize) pairs
        self.combined_output = nA - 2 + nB
        self.helper_functions = NetworkFunctions(self.nA, self.nB)
        self.maxlen = params['maxlen']
        self.device = params['device']
        self.process_input = PreProcessLayer(params)
        # self.seed = torch.manual_seed(seed)
        self.state_mapping = params['state_mapping']
        self.action_emb = Embedder(Action.UNOPENED, 64)
        self.betsize_emb = Embedder(self.nB, 64)
        self.noise = GaussianNoise(self.device)
        self.emb = 1248   # appears unused in this network — TODO confirm
        n_heads = 8       # unused while the attention/transformer paths are commented out
        depth = 2         # unused
        # self.attention = EncoderAttention(params['lstm_in'],params['lstm_out'])
        self.lstm = nn.LSTM(params['lstm_in'], params['lstm_out'], bidirectional=True)
        self.batchnorm = nn.BatchNorm1d(self.maxlen)
        # self.blocks = nn.Sequential(
        # IdentityBlock(hidden_dims=(2560,2560,512),activation=F.leaky_relu),
        # IdentityBlock(hidden_dims=(512,512,256),activation=F.leaky_relu),
        # )
        # NOTE(review): 2560 here vs 5120 in the other OmahaActor; must equal
        # maxlen * 2*lstm_out for norm.view(B,-1) below — confirm against params.
        self.fc_final = nn.Linear(2560, self.combined_output)
        self.dropout = nn.Dropout(0.5)

    def forward(self, state, action_mask, betsize_mask):
        """
        state: B,M,39

        Returns tensor outputs when B > 1 (training) and Python scalars when
        B == 1 (playing a hand).
        """
        x = state
        if not isinstance(x, torch.Tensor):
            x = torch.tensor(x, dtype=torch.float32).to(self.device)
            action_mask = torch.tensor(action_mask, dtype=torch.float).to(self.device)
            betsize_mask = torch.tensor(betsize_mask, dtype=torch.float).to(self.device)
        mask = combined_masks(action_mask, betsize_mask)
        out = self.process_input(x)
        B, M, c = out.size()
        n_padding = self.maxlen - M
        if n_padding < 0:
            # history longer than maxlen: keep only the most recent maxlen steps
            h = out[:, -self.maxlen:, :]
        else:
            padding = torch.zeros(B, n_padding, out.size(-1)).to(self.device)
            # pad at the front so the most recent steps sit at the end
            h = torch.cat((padding, out), dim=1)
        lstm_out, hidden_states = self.lstm(h)
        norm = self.batchnorm(lstm_out)
        # self.attention(out)
        # blocks_out = self.blocks(lstm_out.view(-1))
        t_logits = self.fc_final(norm.view(B, -1))
        category_logits = self.noise(t_logits)
        # skip connection
        # category_logits += h
        action_soft = F.softmax(category_logits, dim=-1)
        # if torch.cuda.is_available():
        # action_probs = norm_frequencies(action_soft,mask.cuda())
        # previous_action = torch.as_tensor(state[:,-1,self.state_mapping['last_action']]).cuda()#.to(self.device)
        # else:
        action_probs = norm_frequencies(action_soft, mask)
        previous_action = torch.as_tensor(
            state[:, -1, self.state_mapping['last_action']]).to(self.device)
        m = Categorical(action_probs)
        action = m.sample()
        action_category, betsize_category = self.helper_functions.batch_unwrap_action(
            action, previous_action)
        if B > 1:
            # batch training: keep tensor outputs
            outputs = {
                'action': action,
                'action_category': action_category,
                'action_prob': m.log_prob(action),
                'action_probs': action_probs,
                'betsize': betsize_category
            }
        else:
            # playing hand: convert to Python scalars
            outputs = {
                'action': action.item(),
                'action_category': action_category.item(),
                'action_prob': m.log_prob(action),
                'action_probs': action_probs,
                'betsize': betsize_category.item()
            }
        return outputs