def shrink_M_to_1(output, loss_type, N, num_proto, multi_policy_proto):
    """Collapse all negative columns of ``output`` into a single column.

    Args:
        output: 2-D tensor of raw scores, one row per sample. The leading
            columns belong to the positive classes, the rest are negatives.
        loss_type: Container that is checked for the key ``'max_out'``.
        N: Number of positive classes.
        num_proto: Number of prototype columns per positive class.
        multi_policy_proto: How prototypes are reduced when ``num_proto > 1``;
            only ``'max_softmax'`` is supported.

    Returns:
        Tensor with ``N + 1`` columns: the ``N`` positive entries followed by
        one merged negative entry (max of the raw negatives for ``'max_out'``,
        summed softmax probability otherwise).

    Raises:
        AssertionError: If ``num_proto > 1`` with an unsupported policy.
    """
    # Plain slicing keeps everything on the device of ``output``; the index
    # tensors used previously were always created on CUDA.
    if 'max_out' in loss_type:
        positives = output[:, :N]
        hardest_negative, _ = torch.max(output[:, N:], dim=1, keepdim=True)
        return torch.cat((positives, hardest_negative), dim=1)

    if num_proto > 1:
        if multi_policy_proto != 'max_softmax':
            raise AssertionError(
                'unsupported multi_policy_proto: {!r}'.format(
                    multi_policy_proto))
        batch_size = output.shape[0]
        proto_scores = output[:, :(N * num_proto)].contiguous().view(
            batch_size, N, num_proto)
        best_proto, _ = torch.max(proto_scores, dim=2)
        # After this reduction the tensor has N + M columns, so the split
        # below must not rely on the original column count.
        output = torch.cat((best_proto, output[:, (N * num_proto):]), dim=1)

    # Softmax is applied exactly once, on the (possibly reduced) scores.
    probs = F.softmax(output, dim=1)
    negative_mass = torch.sum(probs[:, N:], 1, keepdim=True)
    return torch.cat((probs[:, :N], negative_mass), dim=1)
def categorical(mean, temp):
    """Draw a relaxed (Gumbel-softmax) sample from categorical probabilities.

    Args:
        mean: Tensor of probabilities.
        temp: Temperature dividing the perturbed log-probabilities.

    Returns:
        Tensor with the same shape as ``mean``.
    """
    # Gumbel noise -log(-log(U)); the 1e-10 terms keep both logs finite.
    uniform = torch.rand_like(mean)
    gumbel = -torch.log(1e-10 - torch.log(1e-10 + uniform))
    logits = (torch.log(mean + 1e-10) + gumbel) / temp

    if mean.dim() == 3:
        # Same as flattening to 2-D and normalising each row, but without a
        # view() that would fail on non-contiguous input.
        return F.softmax(logits, dim=-1)

    # Spell out the axis that the deprecated implicit-dim softmax chose here:
    # axis 0 for 0-D/1-D input, axis 1 otherwise.
    return F.softmax(logits, dim=0 if mean.dim() < 2 else 1)
def masked_softmax(x, valid_len):
    """Softmax over the last axis with some positions masked out.

    Args:
        x: Scores; the original author's note says a 3-D tensor.
        valid_len: ``None`` (no masking) or a 1-D / 2-D tensor of lengths.

    Returns:
        Tensor shaped like ``x``.
    """
    if valid_len is None:
        return fn.softmax(x, dim=-1)

    original_shape = x.shape
    if valid_len.dim() == 1:
        # One length per leading entry: repeat it for every row on axis 1.
        flat_len = torch.repeat_interleave(
            valid_len, repeats=original_shape[1], dim=0)
    else:
        flat_len = valid_len.reshape(-1)

    rows = x.reshape(-1, original_shape[-1])
    # Masked entries get a large negative score so their exp is ~0.
    # NOTE(review): sequence_mask is defined elsewhere; confirm it masks
    # positions at or beyond each row's length.
    rows = sequence_mask(rows, flat_len, value=-1e6)
    return fn.softmax(rows.reshape(original_shape), dim=-1)
def main():
    """Train a two-matrix word model and print predictions for 'paris'.

    Reads sentences from ``words2.txt``, builds the vocabulary, trains ``W1``
    and ``W2`` with plain gradient descent, and finally prints the softmax
    distribution obtained from the learned column for the word 'paris'.
    """
    # Read sentences
    sentences = readFile("words2.txt")
    print(sentences)

    # Make uniq words list. dict.fromkeys removes duplicates in one pass
    # while keeping first-occurrence order.
    uniqWords = list(
        dict.fromkeys(word for sentence in sentences for word in sentence))
    print(uniqWords)
    uniqWordSize = len(uniqWords)

    # Make trainPairs
    trainPairs = trainGenerator(sentences, uniqWords)

    dims = 5
    learning_rate = 0.002
    W1 = torch.randn(dims, uniqWordSize).float().requires_grad_()
    W2 = torch.randn(uniqWordSize, dims).float().requires_grad_()

    epo = 1001
    for i in range(epo):
        avg_loss = 0
        samples = 0
        for x, y in trainPairs:
            x = torch.from_numpy(x).float()
            y = torch.from_numpy(np.array([y])).long()
            samples += len(y)

            a1 = torch.matmul(W1, x)
            a2 = torch.matmul(W2, a1)
            logSoftmax = F.log_softmax(a2, dim=0)
            loss = F.nll_loss(logSoftmax.view(1, -1), y)
            loss.backward()
            avg_loss += loss.item()

            # Manual SGD step; no_grad keeps the update out of autograd.
            with torch.no_grad():
                W1 -= learning_rate * W1.grad
                W2 -= learning_rate * W2.grad
                W1.grad.zero_()
                W2.grad.zero_()

        # Report every 100th epoch after the first 100 (first at i == 200).
        if i > 100 and i % 100 == 0:
            print(avg_loss / samples)

    parisVecter = W1[:, uniqWords.index('paris')].detach().numpy()
    hidden = torch.from_numpy(parisVecter).float()
    a = torch.matmul(W2, hidden)
    probs = F.softmax(a, dim=0).detach().numpy()
    for context, prob in zip(uniqWords, probs):
        print(f'{context}: {prob:.2f}')
def forward(self, hidden, encoder_outputs, src_len=None):
    '''
    Compute attention weights of the decoder state over the encoder outputs.

    :param hidden:
        previous hidden state of the decoder, in shape (layers*directions,B,H)
    :param encoder_outputs:
        encoder outputs from Encoder, in shape (T,B,H)
    :param src_len:
        used for masking. NoneType or tensor in shape (B) indicating sequence length
    :return
        attention weights normalised over T, with an axis of size 1 inserted
        at position 1 (i.e. (B,1,T) when ``self.score`` returns (B,T))
    '''
    max_len = encoder_outputs.size(0)
    H = hidden.repeat(max_len, 1, 1).transpose(0, 1)
    encoder_outputs = encoder_outputs.transpose(0, 1)  # [B,T,H]
    attn_energies = self.score(H, encoder_outputs)  # compute attention score

    if src_len is not None:
        device = attn_energies.device
        lengths = torch.as_tensor(src_len, device=device)
        positions = torch.arange(max_len, device=device)
        # True marks padding, i.e. time steps at or beyond the length.
        mask = positions.unsqueeze(0) >= lengths.unsqueeze(1)  # [B,T]
        if attn_energies.dim() == 3:
            # NOTE(review): self.score is not visible here; support a
            # [B,1,T] score layout as well as the documented [B,T].
            mask = mask.unsqueeze(1)
        attn_energies = attn_energies.masked_fill(mask, -1e18)

    # Normalise over the time axis explicitly.
    return F.softmax(attn_energies, dim=-1).unsqueeze(1)
def on_epoch_end(self, last_target, last_output, **kwargs):
    """Print the AUC over everything collected in self.output / self.target.

    Does nothing when no outputs have been collected. ``last_target``,
    ``last_output`` and ``kwargs`` are accepted but not used.
    """
    if len(self.output) == 0:
        return

    all_logits = torch.cat(self.output)
    all_targets = torch.cat(self.target)
    probabilities = F.softmax(all_logits, dim=1)
    auc = auroc_score(probabilities, all_targets)
    print(f'AUC: {auc:.5f}')
def forward(self, outputs, target_sizes):
    """Turn raw model outputs into per-image score/label/box dictionaries.

    Parameters:
        outputs: raw outputs of the model; the keys 'pred_logits' and
            'pred_boxes' are read
        target_sizes: tensor of dimension [batch_size x 2] containing the size
            of each image of the batch. For evaluation, this must be the
            original image size (before any data augmentation). For
            visualization, this should be the image size after data
            augmentation, but before padding

    Returns:
        A list with one dict per image holding 'scores', 'labels', 'boxes'.
    """
    logits = outputs['pred_logits']
    rel_boxes = outputs['pred_boxes']

    assert len(logits) == len(target_sizes)
    assert target_sizes.shape[1] == 2

    # The last class column is left out when picking the best class.
    class_probs = F.softmax(logits, -1)
    scores, labels = class_probs[..., :-1].max(-1)

    # convert to [x0, y0, x1, y1] format
    abs_boxes = box_convert(rel_boxes, in_fmt="cxcywh", out_fmt="xyxy")

    # and from relative [0, 1] to absolute [0, height] coordinates
    img_h, img_w = target_sizes.unbind(1)
    scale = torch.stack([img_w, img_h, img_w, img_h], dim=1)
    abs_boxes = abs_boxes * scale[:, None, :]

    results = []
    for score, label, box in zip(scores, labels, abs_boxes):
        results.append({'scores': score, 'labels': label, 'boxes': box})
    return results
def select_action(self, observation):
    """Sample an action from the policy and remember its log-probability.

    Args:
        observation: Numeric observation convertible to a float tensor.

    Returns:
        The sampled action as a Python number. The action's log-probability
        is appended to ``self.action_memory``.
    """
    state = T.as_tensor(observation, dtype=T.float32).to(self.policy.device)
    # Normalise over the last axis explicitly instead of relying on the
    # deprecated implicit-dim softmax.
    probabilities = F.softmax(self.policy.forward(state), dim=-1)
    action_probs = T.distributions.Categorical(probabilities)
    action = action_probs.sample()
    log_probs = action_probs.log_prob(action)
    self.action_memory.append(log_probs)
    return action.item()
def _forward_test(self, cls_logits, bbox_pred):
    """Build detections from class logits and box regressions.

    Returns:
        A pair ``(detections, {})`` where ``detections`` is whatever
        ``self.post_processor`` returns for ``(scores, boxes)``.
    """
    # Priors are created on first use and cached on the instance.
    if self.priors is None:
        self.priors = PriorBox(self.cfg)().to(bbox_pred.device)

    scores = F.softmax(cls_logits, dim=2)

    decoded = box_utils.convert_locations_to_boxes(
        bbox_pred,
        self.priors,
        self.cfg.MODEL.CENTER_VARIANCE,
        self.cfg.MODEL.SIZE_VARIANCE,
    )
    corner_boxes = box_utils.center_form_to_corner_form(decoded)

    detections = self.post_processor((scores, corner_boxes))
    return detections, {}
def forward(self, x, sampling=True):
    """Compute categorical probabilities and optionally a relaxed sample.

    Args:
        x: Input tensor; passed through ``self.network`` first when one is
            set.
        sampling: When false only the probabilities are returned.

    Returns:
        ``mean`` if ``sampling`` is false, otherwise ``(mean, z)`` where ``z``
        comes from ``categorical(mean, temp=self.temp)``.
    """
    features = x if self.network is None else self.network.forward(x)

    # Categories live on the last axis, which is also what categorical()
    # assumes for 3-D input. The implicit-dim softmax used before would have
    # normalised 3-D input over axis 0.
    mean = F.softmax(self.mean_layer.forward(features), dim=-1)

    if not sampling:
        return mean
    z = categorical(mean, temp=self.temp)
    return mean, z
def get_prediction(self, x_input, hard=True, batch_size=128):
    """Run the model over ``x_input`` in mini-batches and collect the results.

    Args:
        x_input: Array-like whose first axis indexes the samples.
        hard: If true return ``argmax`` class indices, otherwise the softmax
            probabilities over axis 1.
        batch_size: Number of samples per forward pass (default 128).

    Returns:
        A NumPy array with one entry (or row) per input sample.
    """
    n_imgs = x_input.shape[0]
    out_list = []
    for start in range(0, n_imgs, batch_size):
        x = torch.tensor(x_input[start:start + batch_size]).float()
        # NOTE(review): lib.cuda is defined elsewhere; presumably it moves
        # the batch to the GPU when one is in use.
        x = lib.cuda(x)
        out = self.forward(x)
        out = torch.argmax(out, dim=1) if hard else F.softmax(out, dim=1)
        out_list.extend(out.detach().cpu().numpy())
    return np.array(out_list)
def train(epoch):
    """Run one optimisation step on the source (and optionally target) data.

    Updates the module-level ``rate`` before encoding, builds the source
    classification loss plus a small penalty on every parameter whose name
    contains "weight", optionally adds the domain-classifier and
    target-entropy terms, and then steps ``optimizer``.
    """
    global rate

    for model in models:
        model.train()
    optimizer.zero_grad()

    # Must be set before encode() runs.
    # NOTE(review): presumably consumed by code outside this function.
    rate = min((epoch + 1) / epochs, 0.05)

    encoded_source = encode(source_data, "source")
    encoded_target = encode(target_data, "target")

    # use source classifier loss:
    cls_loss = loss_func(cls_model(encoded_source), source_data.y)

    weight_params = (
        param
        for model in models
        for name, param in model.named_parameters()
        if "weight" in name
    )
    for param in weight_params:
        cls_loss = cls_loss + param.mean() * 3e-3

    loss = cls_loss
    if use_UDAGCN:
        # use domain classifier loss: source samples are labelled 0 and
        # target samples 1.
        source_domain_preds = domain_model(encoded_source)
        target_domain_preds = domain_model(encoded_target)

        source_domain_labels = torch.zeros(
            source_domain_preds.size(0)).type(torch.LongTensor).to(device)
        target_domain_labels = torch.ones(
            target_domain_preds.size(0)).type(torch.LongTensor).to(device)

        loss_grl = (loss_func(source_domain_preds, source_domain_labels)
                    + loss_func(target_domain_preds, target_domain_labels))
        loss = cls_loss + loss_grl

        # use target classifier loss: entropy of the target predictions,
        # weighted by training progress.
        target_probs = F.softmax(cls_model(encoded_target), dim=-1)
        target_probs = torch.clamp(target_probs, min=1e-9, max=1.0)
        entropy_per_sample = torch.sum(
            -target_probs * torch.log(target_probs), dim=-1)
        loss = loss + torch.mean(entropy_per_sample) * (epoch / epochs * 0.01)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
def scaled_dot_product(self, x, Q, K, V):
    """Scaled dot-product attention.

    Args:
        x: Unused; kept for interface compatibility.
        Q: Query tensor.
        K: Key tensor, multiplied as ``Q @ K`` without a transpose, so it
            must already be laid out with the feature axis second to last.
        V: Value tensor.

    Returns:
        ``softmax(Q @ K / sqrt(d_k)) @ V`` with the softmax taken over the
        last axis.
    """
    # torch.bmm is a strictly batched matmul without broadcasting, whereas
    # torch.matmul also broadcasts over leading batch dimensions.
    scores = torch.matmul(Q, K)
    # ``** 0.5`` works whether d_k is a Python number or a tensor.
    scores = scores / (self.d_k ** 0.5)
    if self.mask:
        # Masking is not implemented.
        pass
    # Normalise over the key axis. Without an explicit dim the softmax would
    # run over axis 0 (the batch) for 3-D scores.
    weights = F.softmax(scores, dim=-1)
    return torch.matmul(weights, V)
def log_rank_loss(self, y_pos, y_neg, temp=0):
    """Softplus ranking loss with softmax-weighted negatives.

    Args:
        y_pos: 1-D tensor with M positive scores.
        y_neg: 1-D tensor with N negative scores, N a multiple of M.
        temp: Multiplier applied to the negatives before the softmax that
            weights them; 0 gives uniform weights.

    Returns:
        Scalar loss tensor (moved to CUDA when ``self.gpu`` is truthy).
    """
    M = y_pos.size(0)
    N = y_neg.size(0)
    y_pos = self.gamma - y_pos
    y_neg = self.gamma - y_neg

    # Exact integer division.
    C = N // M
    # Reshape to (M, C): one row of C negatives per positive.
    y_neg = y_neg.view(C, -1).transpose(0, 1)

    # Weights over each row's negatives; axis 1 is what the implicit-dim
    # softmax selected for this 2-D tensor.
    p = F.softmax(temp * y_neg, dim=1)

    loss_pos = torch.sum(F.softplus(-1 * y_pos))
    loss_neg = torch.sum(p * F.softplus(y_neg))
    loss = (loss_pos + loss_neg) / 2 / M
    if self.gpu:
        loss = loss.cuda()
    return loss
def attention(query, key, value, mask=None, dropout=None):
    """Compute 'Scaled Dot Product Attention'.

    Returns a pair: the attention-weighted values and the attention weights.
    Positions where ``mask == 0`` get a score of -1e9 before the softmax;
    ``dropout``, when given, is applied to the weights.
    """
    scale = math.sqrt(query.size(-1))
    logits = torch.matmul(query, key.transpose(-2, -1)) / scale

    if mask is not None:
        logits = logits.masked_fill(mask == 0, -1e9)

    weights = F.softmax(logits, dim=-1)
    if dropout is not None:
        weights = dropout(weights)

    attended = torch.matmul(weights, value)
    return attended, weights
def eval_probs_on_grid(self, extent, res=400):
    """Evaluate the ensemble on a res x res grid spanning [-extent, extent].

    :return: Numpy array of probabilities predicted by the model; the
        original author documents its shape as
        [num_eval_points, num_classes, num_models]
    """
    span = (-extent, extent)
    xx, yy = get_grid(span, span, res)

    # One row per grid point: (x, y).
    coords = np.stack((xx.ravel(), yy.ravel()), axis=1)
    eval_points = torch.from_numpy(coords)

    with torch.no_grad():
        logits = self(eval_points)
        probs = F.softmax(logits, dim=1).cpu().numpy()
    return probs
def forward(self, x_level_0, x_level_1, x_level_2):
    """Fuse three feature levels with learned, softmax-normalised weights.

    Args:
        x_level_0, x_level_1, x_level_2: Feature maps of the three levels.
            The code upsamples level 0 and downsamples level 2, so level 0
            is presumably the coarsest.

    Returns:
        ``self.expand`` applied to the weighted sum of the resized levels.

    Raises:
        ValueError: If ``self.level`` is not 0, 1 or 2.
    """
    # Feature resizing: bring every level to the size of ``self.level``.
    if self.level == 0:
        level_0_resized = x_level_0
        level_1_resized = self.stride_level_1(x_level_1)
        level_2_downsampled_inter = F.max_pool2d(
            x_level_2, 3, stride=2, padding=1)
        level_2_resized = self.stride_level_2(level_2_downsampled_inter)
    elif self.level == 1:
        level_0_compressed = self.compress_level_0(x_level_0)
        # scale_factor must be a keyword: the second positional argument of
        # F.interpolate is the absolute output ``size``.
        level_0_resized = F.interpolate(
            level_0_compressed, scale_factor=2, mode='nearest')
        level_1_resized = x_level_1
        level_2_resized = self.stride_level_2(x_level_2)
    elif self.level == 2:
        level_0_compressed = self.compress_level_0(x_level_0)
        level_0_resized = F.interpolate(
            level_0_compressed, scale_factor=4, mode='nearest')
        if self.dim[1] != self.dim[2]:
            level_1_compressed = self.compress_level_1(x_level_1)
            level_1_resized = F.interpolate(
                level_1_compressed, scale_factor=2, mode='nearest')
        else:
            level_1_resized = F.interpolate(
                x_level_1, scale_factor=2, mode='nearest')
        level_2_resized = x_level_2
    else:
        raise ValueError(
            'level must be 0, 1 or 2, got {!r}'.format(self.level))

    # The fusion weights are learned by the network as well.
    level_0_weight_v = self.weight_level_0(level_0_resized)
    level_1_weight_v = self.weight_level_1(level_1_resized)
    level_2_weight_v = self.weight_level_2(level_2_resized)
    levels_weight_v = torch.cat(
        (level_0_weight_v, level_1_weight_v, level_2_weight_v), 1)
    levels_weight = self.weight_levels(levels_weight_v)
    levels_weight = F.softmax(levels_weight, dim=1)  # alpha

    # Adaptive fusion: weights sum to 1 across the three levels.
    fused_out_reduced = (level_0_resized * levels_weight[:, 0:1, :, :]
                         + level_1_resized * levels_weight[:, 1:2, :, :]
                         + level_2_resized * levels_weight[:, 2:, :, :])

    out = self.expand(fused_out_reduced)
    return out
def forward(self, output, target):
    """Split the raw prediction map and compute box, IoU and class losses.

    Args:
        output: Tensor of shape (B, C, H, W). The view below requires
            C == 5 * (5 + 20).
        target: Ground truth handed unchanged to ``self.build_target``.

    Returns:
        Tuple ``(box_loss, iou_loss, class_loss)`` from ``self.cal_loss``.
    """
    num_anchors = 5
    num_classes = 20

    B, C, H, W = output.size()
    # One row of 5 + num_classes values per anchor per cell.
    out = output.permute(0, 2, 3, 1).contiguous().view(
        B, H * W * num_anchors, 5 + num_classes)

    xy_pred = torch.sigmoid(out[:, :, 0:2])
    conf_pred = torch.sigmoid(out[:, :, 4:5])
    hw_pred = torch.exp(out[:, :, 2:4])
    # Raw class scores are passed on; no softmax is applied here.
    class_score = out[:, :, 5:]
    delta_pred = torch.cat([xy_pred, hw_pred], dim=-1)

    output_var = (delta_pred, conf_pred, class_score)
    # Target building gets the underlying data, outside of autograd.
    output_data = [e.data for e in output_var]
    target_var = self.build_target(output_data, target, H, W)

    box_loss, iou_loss, class_loss = self.cal_loss(output_var, target_var)
    return box_loss, iou_loss, class_loss
def model_test(test_loader, net, num_pos_classes, loss_type=()):
    """Compute the accuracy of ``net`` over ``test_loader``.

    When the network emits more columns than ``num_pos_classes``, the extra
    (negative) columns are merged into one before the argmax: their maximum
    raw score if ``'max_out'`` is in ``loss_type``, otherwise their summed
    softmax probability.

    Args:
        test_loader: Iterable of ``(inputs, labels)`` batches exposing a
            ``dataset`` attribute with a length.
        net: Model to evaluate; it is switched to eval mode.
        num_pos_classes: Number of positive classes.
        loss_type: Container checked for the key ``'max_out'``.

    Returns:
        Number of correct predictions divided by ``len(test_loader.dataset)``.
    """
    net.eval()

    # Follow the device the model lives on. Without parameters, fall back to
    # CUDA when available, which matches the previous always-CUDA behaviour.
    try:
        device = next(net.parameters()).device
    except (AttributeError, StopIteration):
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    correct = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            output = net(inputs)
            output_dim = output.shape[1]

            if output_dim != num_pos_classes:
                if 'max_out' in loss_type:
                    # deprecated. call shrink_M_to_1
                    first_pos = output[:, :num_pos_classes]
                    second_max_neg, _ = torch.max(
                        output[:, num_pos_classes:], dim=1, keepdim=True)
                    output = torch.cat((first_pos, second_max_neg), dim=1)
                else:
                    output = F.softmax(output, dim=1)
                    first_pos = output[:, :num_pos_classes]
                    prob_neg = torch.sum(
                        output[:, num_pos_classes:], 1, keepdim=True)
                    output = torch.cat((first_pos, prob_neg), dim=1)

            pred_idx = output.max(1, keepdim=True)[1]
            correct += pred_idx.eq(labels.view_as(pred_idx)).long().cpu().sum()

    return 1. * correct / len(test_loader.dataset)
def attention(query, key, value, mask=None, dropout=None):
    """Compute 'Scaled Dot Product Attention'.

    The two most commonly used attention functions are additive attention
    and dot-product (multiplicative) attention. Here we adopt the dot-product
    one and apply a scaling factor. Additive attention computes the
    compatibility using a feed-forward network with a single hidden layer.
    While the two are similar in theoretical complexity, dot-product
    attention is much faster and more space-efficient in practice, since it
    can be implemented using highly optimized matrix multiplication code.

    Returns the attended values together with the attention weights.
    """
    key_t = key.transpose(-2, -1)
    raw_scores = torch.matmul(query, key_t)
    scaled = raw_scores / math.sqrt(query.size(-1))

    # Masked-out positions (mask == 0) get a very negative score.
    if mask is not None:
        scaled = scaled.masked_fill(mask == 0, -1e9)

    attn = F.softmax(scaled, dim=-1)
    attn = attn if dropout is None else dropout(attn)
    return torch.matmul(attn, value), attn
def top_k_top_p_filtering(logits, top_k=100, top_p=0.95,
                          filter_value=-float('Inf')):
    """Filter a distribution of logits using top-k and/or nucleus (top-p) filtering.

    Works on the last axis, so ``logits`` may be a single distribution of
    shape (vocabulary size) or a batch of shape (batch, vocabulary size).
    ``logits`` is modified in place and also returned.

    Args:
        logits: logits distribution
        top_k > 0: keep only top k tokens with highest probability (top-k
            filtering).
        top_p > 0.0: keep the top tokens with cumulative probability >= top_p
            (nucleus filtering). Nucleus filtering is described in Holtzman
            et al. (http://arxiv.org/abs/1904.09751)
        filter_value: value written into every removed position.

    From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
    """
    top_k = min(top_k, logits.size(-1))  # Safety check
    if top_k > 0:
        # Remove all tokens with a probability less than the last token of
        # the top-k
        kth_largest = torch.topk(logits, top_k)[0][..., -1, None]
        logits[logits < kth_largest] = filter_value

    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(
            F.softmax(sorted_logits, dim=-1), dim=-1)

        # Remove tokens with cumulative probability above the threshold
        sorted_indices_to_remove = cumulative_probs > top_p
        # Shift the indices to the right to keep also the first token above
        # the threshold
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
            ..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0

        # Scatter sorted flags back to original indexing. dim=-1 handles both
        # 1-D and batched input.
        indices_to_remove = sorted_indices_to_remove.scatter(
            dim=-1, index=sorted_indices, src=sorted_indices_to_remove)
        logits[indices_to_remove] = filter_value
    return logits
def activation(self, x):
    """Post-process a two-element prediction.

    Returns a tuple: ``ToXYXY`` applied to ``x[0]`` and the softmax of
    ``x[1]`` over its last axis.
    NOTE(review): ToXYXY is defined elsewhere; presumably it converts boxes
    to corner (x0, y0, x1, y1) format - confirm.
    """
    return (ToXYXY(x[0]), F.softmax(x[1], dim=-1))

# NOTE(review): the body of ``decodes`` lies outside this excerpt.
def decodes(self, x, pad=True):
def predict(self, x):
    """Return class probabilities averaged over axis 2.

    The output of ``self.forward`` is softmax-normalised over axis 1 and the
    result is then averaged over axis 2.
    """
    logits = self.forward(x)
    probs = F.softmax(logits, dim=1)
    return probs.mean(dim=2)
state, new_episode=True) state = stacked_state log_probs = [] values = [] advantages = [] rewards = [] entropys = [] critics = [] actors = [] overall_entropy = 0 for t in count(): state = np.reshape(state, (1, 4, 84, 84)) policy, value = actor_critic.forward( torch.from_numpy(state).float().to(device)) probs = Categorical(F.softmax(policy, dim=1)) # value = critic.forward(torch.from_numpy(state).float().to(device)) action = probs.sample() log_prob = probs.log_prob(action) entropy = probs.entropy() log_probs.append(log_prob) values.append(value) entropys.append(entropy) # print("entropy {} and probs {}".format(entropy, probs)) overall_entropy += entropy.item() # step in environment next_state, reward, done, _ = env.step(action) stacked_next_state, stacked_frames = stack_frames( stacked_frames, next_state, new_episode=False) next_state = stacked_next_state next_state = next_state.reshape((1, 4, 84, 84))
def forward(self, pred, target):
    '''
    Compute the combined loss selected by ``self.loss_type``.

    pred should be the linear output. softmax will be calculated here

    :param pred: raw scores, reshaped to
        (batch, self.M + self._num_proto * self.N)
    :param target: class indices in [0, self.N]; the value self.N marks a
        negative sample
    :return: sum of the enabled terms ('cross_entropy', 'entropy_loss',
        'uniform_loss', 'max_out'), each scaled by its weight in
        ``self.loss_type`` where one is looked up
    '''
    batch_size = pred.size(0)
    pred = pred.view(batch_size, self.M + self._num_proto * self.N)
    proto_cols = self.N * self._num_proto

    if self._num_proto > 1:
        if self._multi_policy_proto == 'max_softmax':
            # Keep the best prototype score per positive class, then softmax.
            first = pred[:, :proto_cols].contiguous().view(
                batch_size, self.N, self._num_proto)
            first_max, _ = torch.max(first, dim=2)
            second = pred[:, proto_cols:]
            pred = torch.cat((first_max, second), dim=1)
            prediction = F.softmax(pred, dim=1)
        elif self._multi_policy_proto == 'softmax_sum':
            # Softmax over all columns, then add up each class's prototypes.
            prediction = F.softmax(pred, dim=1)
            first = prediction[:, :proto_cols].contiguous().view(
                batch_size, self.N, self._num_proto)
            first_sum = torch.sum(first, dim=2)
            second = prediction[:, proto_cols:]
            prediction = torch.cat((first_sum, second), dim=1)
        else:
            raise ValueError('unsupported multi_policy_proto: {!r}'.format(
                self._multi_policy_proto))
    else:
        prediction = F.softmax(pred, dim=1)

    # Probabilities of the N positive classes and of the M negative slots.
    # Computed unconditionally because the uniform loss needs prob_M even
    # when the cross-entropy term is disabled.
    prob_N = prediction[:, :self.N]
    prob_M = prediction[:, self.N:self.N + self.M]

    loss = 0

    # cross entropy loss
    loss_ce = 0
    if 'cross_entropy' in self.loss_type:
        prob_sM = torch.sum(prob_M, 1, keepdim=True)
        prob_N1 = torch.cat((prob_N, prob_sM), dim=1)
        log_prob_N1 = torch.log(prob_N1 + self.eps)
        loss_ce = F.nll_loss(log_prob_N1, target)
        loss += loss_ce * self.loss_type.get('cross_entropy', 1)

    # entropy loss
    loss_en = 0
    if 'entropy_loss' in self.loss_type:
        loss_en = -torch.mean(
            torch.sum(prediction * torch.log(prediction + self.eps), dim=1))
        loss += loss_en * self.loss_type.get('entropy_loss', 1)

    # loss on the average negative distribution of the negative samples
    loss_uniform = 0
    if 'uniform_loss' in self.loss_type:
        negative_rows = (target == self.N).nonzero().squeeze(1)
        negative_prob_M = prob_M[negative_rows, :]
        # With no negative sample in the batch the mean below would be NaN,
        # so the term is skipped.
        if negative_prob_M.size(0) > 0:
            norm_neg_prob_M = negative_prob_M / (
                torch.sum(negative_prob_M, dim=1) + self.eps).view(
                    -1, 1).expand_as(negative_prob_M)
            avg_norm_neg_prob_M = torch.mean(norm_neg_prob_M, dim=0)
            loss_uniform = -torch.mean(
                torch.log(avg_norm_neg_prob_M + self.eps)) - torch.log(
                    pred.new_tensor([float(self.M)]))
            loss += loss_uniform * self.loss_type.get('uniform_loss', 1)

    if (self._iter % 100) == 0:
        # float() handles both the plain 0 placeholders and tensors.
        logging.info(
            'loss ce = {}; loss en = {}; loss uniform = {}'.format(
                float(loss_ce), float(loss_en), float(loss_uniform)))

    if 'max_out' in self.loss_type:
        # NOTE(review): with 'softmax_sum' and several prototypes, pred still
        # holds one column per prototype here - confirm this combination.
        pred_N = pred[:, :self.N]
        pred_M = pred[:, self.N:self.N + self.M]
        pred_maxM, _ = torch.max(pred_M, dim=1, keepdim=True)
        pred_NmaxM = torch.cat((pred_N, pred_maxM), dim=1)
        loss += self._ce(pred_NmaxM, target)

    self._iter = self._iter + 1
    return loss
# Fragment of a demo script: x, y and z are defined before this excerpt.
print(z.grad_fn)
# Turn on gradient tracking for both operands (in place) and redo the sum.
x = x.requires_grad_()
y = y.requires_grad_()
z = x + y
print(z)
print(z.grad_fn)
print(z.requires_grad)
print("===========")
# detach() gives a tensor cut off from the graph; its grad_fn is printed next.
new_z = z.detach()
print(new_z)
print(new_z.grad_fn)
# Softmax and log-softmax of a random 5-element vector, and the softmax sum.
data = torch.randn(5)
print(data)
print(F.softmax(data, dim=0))
print(F.softmax(data, dim=0).sum())
print(F.softmax(data, dim=0).sum())
print(F.log_softmax(data, dim=0))
# NOTE(review): stray expression statement with no effect.
1
def forward(self, inputs):
    """Combine a sequence of tensors into one by a learned weighted sum.

    The inputs are stacked along a new axis 1, ``self.dense_weight`` scores
    the stack, the scores are softmax-normalised across that axis, and the
    weighted inputs are summed over it.
    """
    stack = torch.stack(inputs, dim=1)
    scores = self.dense_weight(stack)
    mix = F.softmax(scores, dim=1)
    weighted = stack * mix
    return weighted.sum(dim=1)
def forward(self, x):
    """Run the four-layer network and return probabilities over the last axis.

    ``hidden1`` to ``hidden3`` are each followed by a ReLU; the output of
    ``hidden4`` is softmax-normalised.
    """
    x = F.relu(self.hidden1(x))
    x = F.relu(self.hidden2(x))
    x = F.relu(self.hidden3(x))
    # Explicit axis: an implicit-dim softmax is deprecated and would pick
    # axis 0 for 3-D input.
    return F.softmax(self.hidden4(x), dim=-1)