def cls_loss(self, gt_label, pred_label):
    # Keep only elements whose ground-truth label is >= 0; only labels 0 and 1
    # contribute to the detection loss.
    pred_label = torch.squeeze(pred_label)
    mask = torch.ge(gt_label, 0)
    valid_gt_label = torch.masked_select(gt_label, mask).float()
    valid_pred_label = torch.masked_select(pred_label, mask)
    return self.loss_cls(valid_pred_label, valid_gt_label)
def train_multilabel(features, targets, classes, train_split, test_split, C=1.0,
                     ignore_hard_examples=True, after_ReLU=False, normalize_L2=False):
    print('\nHyperparameters:\n - C: {}\n - after_ReLU: {}\n - normL2: {}'.format(C, after_ReLU, normalize_L2))
    train_APs = []
    test_APs = []
    for class_id in range(len(classes)):
        classifier = SVC(C=C, kernel='linear')  # http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

        if ignore_hard_examples:
            train_masks = (targets[train_split][:, class_id] != 0).view(-1, 1)
            train_features = torch.masked_select(features[train_split], train_masks.expand_as(features[train_split])).view(-1, features[train_split].size(1))
            train_targets = torch.masked_select(targets[train_split], train_masks.expand_as(targets[train_split])).view(-1, targets[train_split].size(1))
            test_masks = (targets[test_split][:, class_id] != 0).view(-1, 1)
            test_features = torch.masked_select(features[test_split], test_masks.expand_as(features[test_split])).view(-1, features[test_split].size(1))
            test_targets = torch.masked_select(targets[test_split], test_masks.expand_as(targets[test_split])).view(-1, targets[test_split].size(1))
        else:
            train_features = features[train_split]
            train_targets = targets[train_split]
            test_features = features[test_split]
            test_targets = targets[test_split]

        if after_ReLU:
            train_features[train_features < 0] = 0
            test_features[test_features < 0] = 0

        if normalize_L2:
            train_norm = torch.norm(train_features, p=2, dim=1).unsqueeze(1)
            train_features = train_features.div(train_norm.expand_as(train_features))
            test_norm = torch.norm(test_features, p=2, dim=1).unsqueeze(1)
            test_features = test_features.div(test_norm.expand_as(test_features))

        train_X = train_features.numpy()
        train_y = (train_targets[:, class_id] != -1).numpy()  # uses hard examples if not ignored
        test_X = test_features.numpy()
        test_y = (test_targets[:, class_id] != -1).numpy()

        classifier.fit(train_X, train_y)  # train parameters of the classifier

        train_preds = classifier.predict(train_X)
        train_acc = accuracy_score(train_y, train_preds) * 100
        train_AP = average_precision_score(train_y, train_preds) * 100
        train_APs.append(train_AP)

        test_preds = classifier.predict(test_X)
        test_acc = accuracy_score(test_y, test_preds) * 100
        test_AP = average_precision_score(test_y, test_preds) * 100
        test_APs.append(test_AP)

        print('class "{}" ({}/{}):'.format(classes[class_id], test_y.sum(), test_y.shape[0]))
        print(' - {:8}: acc {:.2f}, AP {:.2f}'.format(train_split, train_acc, train_AP))
        print(' - {:8}: acc {:.2f}, AP {:.2f}'.format(test_split, test_acc, test_AP))

    print('all classes:')
    print(' - {:8}: mAP {:.4f}'.format(train_split, sum(train_APs) / len(classes)))
    print(' - {:8}: mAP {:.4f}'.format(test_split, sum(test_APs) / len(classes)))
def compute_accuracy(self, prob_cls, gt_cls):
    # We only need detections whose ground-truth class is >= 0.
    prob_cls = torch.squeeze(prob_cls)
    mask = torch.ge(gt_cls, 0)
    # Get the valid elements.
    valid_gt_cls = torch.masked_select(gt_cls, mask)
    valid_prob_cls = torch.masked_select(prob_cls, mask)
    size = min(valid_gt_cls.size()[0], valid_prob_cls.size()[0])
    prob_ones = torch.ge(valid_prob_cls, 0.6).float()
    right_ones = torch.eq(prob_ones, valid_gt_cls.float()).float()
    return torch.div(torch.mul(torch.sum(right_ones), float(1.0)), float(size))
def compute_stage_loss(criterion, targets, outputs, masks):
    assert isinstance(outputs, list), 'The outputs type is wrong : {:}'.format(type(outputs))
    total_loss = 0
    each_stage_loss = []
    for output in outputs:
        output = torch.masked_select(output, masks)
        target = torch.masked_select(targets, masks)
        stage_loss = criterion(output, target)
        total_loss = total_loss + stage_loss
        each_stage_loss.append(stage_loss.item())
    return total_loss, each_stage_loss
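# A minimal usage sketch for compute_stage_loss (shapes and the choice of nn.MSELoss
# are assumptions for illustration; the original `criterion` may differ): `outputs`
# is a list of per-stage predictions, `targets` the ground truth of the same shape,
# and `masks` a bool tensor of that shape marking the valid entries.
import torch
import torch.nn as nn

B, K, H, W = 2, 3, 8, 8
outputs = [torch.randn(B, K, H, W) for _ in range(3)]
targets = torch.randn(B, K, H, W)
masks = torch.ones(B, K, H, W, dtype=torch.bool)  # keep every element
total_loss, per_stage = compute_stage_loss(nn.MSELoss(), targets, outputs, masks)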
def updateGradInput(self, input, gradOutput):
    input, mask = input
    if input.type() == 'torch.cuda.FloatTensor':
        torch.arange(0, mask.nelement(), out=self._maskIndexBufferCPU).resize_(mask.size())
        self._maskIndexBuffer.resize_(self._maskIndexBufferCPU.size()).copy_(self._maskIndexBufferCPU)
    else:
        torch.arange(0, mask.nelement(), out=self._maskIndexBuffer).resize_(mask.size())
    torch.masked_select(self._maskIndexBuffer, mask, out=self._maskIndices)
    self._gradBuffer.resize_(input.nelement()).zero_()
    self._gradBuffer.scatter_(0, self._maskIndices, gradOutput)
    self._gradBuffer.resize_(input.size())
    self.gradInput = [self._gradBuffer, self._gradMask.resize_(mask.size()).fill_(0)]
    return self.gradInput
def split_on_targets(self, hiddens, targets):
    # Split the targets into those in the head and in the tail
    split_targets = []
    split_hiddens = []

    # Determine to which split each element belongs (for each start split value, add 1 if equal or greater)
    # This method appears slower, at least for WT-103 values, for the approximate softmax:
    # masks = [(targets >= self.splits[idx]).view(1, -1) for idx in range(1, self.nsplits)]
    # mask = torch.sum(torch.cat(masks, dim=0), dim=0)
    ###
    # This is equally fast for smaller splits as the method below but scales linearly
    mask = None
    for idx in range(1, self.nsplits):
        partial_mask = targets >= self.splits[idx]
        mask = mask + partial_mask if mask is not None else partial_mask
    ###
    # masks = torch.stack([targets] * (self.nsplits - 1))
    # mask = torch.sum(masks >= self.split_starts, dim=0)
    for idx in range(self.nsplits):
        # If there are no splits, avoid costly masked select
        if self.nsplits == 1:
            split_targets, split_hiddens = [targets], [hiddens]
            continue
        # If all the words are covered by earlier targets, we have empties so later stages don't freak out
        if sum(len(t) for t in split_targets) == len(targets):
            split_targets.append([])
            split_hiddens.append([])
            continue
        # Are you in our split?
        tmp_mask = mask == idx
        split_targets.append(torch.masked_select(targets, tmp_mask))
        split_hiddens.append(hiddens.masked_select(tmp_mask.unsqueeze(1).expand_as(hiddens)).view(-1, hiddens.size(1)))
    return split_targets, split_hiddens
def rpn_bbox_loss(target_bbox, rpn_match, rpn_bbox, config):
    """Return the RPN bounding box loss graph.

    config: the model config object.
    target_bbox: [batch, max positive anchors, (dy, dx, log(dh), log(dw))].
        Uses 0 padding to fill in unused bbox deltas.
    rpn_match: [batch, anchors, 1]. Anchor match type. 1=positive,
        -1=negative, 0=neutral anchor.
    rpn_bbox: [batch, anchors, (dy, dx, log(dh), log(dw))]
    """
    # Positive anchors contribute to the loss, but negative and
    # neutral anchors (match value of 0 or -1) don't.
    indices = torch.eq(rpn_match, 1)
    rpn_bbox = torch.masked_select(rpn_bbox, indices)
    batch_counts = torch.sum(indices.float(), dim=1)

    outputs = []
    for i in range(config.IMAGES_PER_GPU):
        outputs.append(target_bbox[i, torch.arange(int(batch_counts[i].cpu().data.numpy()[0])).type(torch.cuda.LongTensor)])
    target_bbox = torch.cat(outputs, dim=0)

    loss = F.smooth_l1_loss(rpn_bbox, target_bbox, size_average=True)
    return loss
def mrcnn_bbox_loss(target_bbox, target_class_ids, pred_bbox):
    """Loss for Mask R-CNN bounding box refinement.

    target_bbox: [batch, num_rois, (dy, dx, log(dh), log(dw))]
    target_class_ids: [batch, num_rois]. Integer class IDs.
    pred_bbox: [batch, num_rois, num_classes, (dy, dx, log(dh), log(dw))]
    """
    # Reshape to merge batch and roi dimensions for simplicity.
    target_class_ids = target_class_ids.contiguous().view(-1)
    target_bbox = target_bbox.contiguous().view(-1, 4)
    pred_bbox = pred_bbox.contiguous().view(-1, pred_bbox.size()[2], 4)

    # Only positive ROIs contribute to the loss, and only the right
    # class_id of each ROI. Get their indices.
    positive_roi_ix = torch.gt(target_class_ids, 0)
    positive_roi_class_ids = torch.masked_select(target_class_ids, positive_roi_ix)
    indices = target_class_ids
    # indices = torch.stack([positive_roi_ix, positive_roi_class_ids], dim=1)

    # Gather the deltas (predicted and true) that contribute to loss
    # target_bbox = torch.gather(target_bbox, positive_roi_ix)
    # pred_bbox = torch.gather(pred_bbox, indices)

    loss = F.smooth_l1_loss(pred_bbox, target_bbox, size_average=True)
    return loss
def forward(self, feat, right, wrong, batch_wrong, fake=None, fake_diff_mask=None):
    num_wrong = wrong.size(1)
    batch_size = feat.size(0)

    feat = feat.view(-1, self.ninp, 1)
    right_dis = torch.bmm(right.view(-1, 1, self.ninp), feat)
    wrong_dis = torch.bmm(wrong, feat)
    batch_wrong_dis = torch.bmm(batch_wrong, feat)

    wrong_score = torch.sum(torch.exp(wrong_dis - right_dis.expand_as(wrong_dis)), 1) \
        + torch.sum(torch.exp(batch_wrong_dis - right_dis.expand_as(batch_wrong_dis)), 1)

    loss_dis = torch.sum(torch.log(wrong_score + 1))
    loss_norm = right.norm() + feat.norm() + wrong.norm() + batch_wrong.norm()

    if fake:
        fake_dis = torch.bmm(fake.view(-1, 1, self.ninp), feat)
        fake_score = torch.masked_select(torch.exp(fake_dis - right_dis), fake_diff_mask)
        margin_score = F.relu(torch.log(fake_score + 1) - self.margin)
        loss_fake = torch.sum(margin_score)
        loss_dis += loss_fake
        loss_norm += fake.norm()

    loss = (loss_dis + 0.1 * loss_norm) / batch_size
    if fake:
        return loss, loss_fake.data[0] / batch_size
    else:
        return loss
def forward(self, input, target, mask):
    logprob_select = torch.gather(input, 1, target)
    out = torch.masked_select(logprob_select, mask)
    loss = -torch.sum(out) / mask.float().sum()
    return loss
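# A minimal sketch of the same masked-NLL computation with concrete tensors
# (illustrative shapes, not from the original module): `log_probs` is (N, V)
# log-probabilities, `target` is (N, 1) class indices, and `mask` is a (N, 1)
# bool tensor that is False on padding positions.
import torch

log_probs = torch.log_softmax(torch.randn(4, 10), dim=1)
target = torch.randint(0, 10, (4, 1))
mask = torch.tensor([[True], [True], [False], [True]])
picked = torch.gather(log_probs, 1, target)   # (N, 1) log-prob of each target class
valid = torch.masked_select(picked, mask)     # drop the padded positions
loss = -valid.sum() / mask.float().sum()      # average NLL over valid positions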
def plot_clusters(num, e, centers, points, fig, model):
    plt.figure(0)
    plt.clf()
    plt.gca().set_xlim([-0.05, 1.05])
    plt.gca().set_ylim([-0.05, 1.05])
    clusters = e[fig].max() + 1
    colors = cm.rainbow(np.linspace(0, 1, clusters))
    for i in range(clusters):
        c = colors[i][:-1]
        mask = e[fig] == i
        x = torch.masked_select(points[fig, :, 0], mask)
        y = torch.masked_select(points[fig, :, 1], mask)
        plt.plot(x.cpu().numpy(), y.cpu().numpy(), 'o', c=rgb2hex(c))
        if centers is not None:
            center = centers[i]
            plt.plot([center.data[0]], [center.data[1]], '*', c=rgb2hex(c))
    plt.title('clustering')
    plt.savefig('./plots/clustering_it_{}_{}.png'.format(num, model))
def forward(self, input, target):
    logprob_select = torch.gather(input, 1, target)
    mask = target.data.gt(0)  # generate the mask (non-zero targets are valid)
    if isinstance(input, Variable):
        mask = Variable(mask, volatile=input.volatile)
    out = torch.masked_select(logprob_select, mask)
    loss = -torch.sum(out)  # sum of the masked negative log-probabilities
    return loss
def rpn_class_loss(rpn_match, rpn_class_logits):
    """RPN anchor classifier loss.

    rpn_match: [batch, anchors, 1]. Anchor match type. 1=positive,
        -1=negative, 0=neutral anchor.
    rpn_class_logits: [batch, anchors, 2]. RPN classifier logits for FG/BG.
    """
    # Get anchor classes. Convert the -1/+1 match to 0/1 values.
    anchor_class = torch.eq(rpn_match, 1)
    # Positive and negative anchors contribute to the loss,
    # but neutral anchors (match value = 0) don't.
    indices = torch.ne(rpn_match, 0.)
    rpn_class_logits = torch.masked_select(rpn_class_logits, indices)
    anchor_class = torch.masked_select(anchor_class, indices)
    rpn_class_logits = rpn_class_logits.contiguous().view(-1, 2)
    anchor_class = anchor_class.contiguous().view(-1).type(torch.cuda.LongTensor)
    loss = F.cross_entropy(rpn_class_logits, anchor_class, weight=None)
    return loss
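# A small sketch of the mask-broadcast trick used above (illustrative values, and
# assuming a recent PyTorch that broadcasts the mask): a (batch, anchors, 1) mask
# broadcasts over the last dimension of the (batch, anchors, 2) logits, so
# masked_select keeps both logits of every selected anchor and view(-1, 2)
# restores them to rows.
import torch

logits = torch.randn(1, 4, 2)
keep = torch.tensor([1, 0, 0, 1]).view(1, 4, 1) != 0
picked = torch.masked_select(logits, keep).view(-1, 2)  # shape (2, 2)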
def test_net(save_folder, net, cuda, dataset, transform, top_k, im_size=300, thresh=0.05):
    num_images = len(dataset)
    # all detections are collected into:
    #   all_boxes[cls][image] = N x 5 array of detections in (x1, y1, x2, y2, score)
    all_boxes = [[[] for _ in range(num_images)] for _ in range(len(labelmap) + 1)]

    # timers
    _t = {'im_detect': Timer(), 'misc': Timer()}
    output_dir = get_output_dir('ssd300_120000', set_type)
    det_file = os.path.join(output_dir, 'detections.pkl')

    for i in range(num_images):
        im, gt, h, w = dataset.pull_item(i)

        x = Variable(im.unsqueeze(0))
        if args.cuda:
            x = x.cuda()
        _t['im_detect'].tic()
        detections = net(x).data
        detect_time = _t['im_detect'].toc(average=False)

        # skip j = 0, because it's the background class
        for j in range(1, detections.size(1)):
            dets = detections[0, j, :]
            mask = dets[:, 0].gt(0.).expand(5, dets.size(0)).t()
            dets = torch.masked_select(dets, mask).view(-1, 5)
            if dets.size(0) == 0:
                continue
            boxes = dets[:, 1:]
            boxes[:, 0] *= w
            boxes[:, 2] *= w
            boxes[:, 1] *= h
            boxes[:, 3] *= h
            scores = dets[:, 0].cpu().numpy()
            cls_dets = np.hstack((boxes.cpu().numpy(),
                                  scores[:, np.newaxis])).astype(np.float32, copy=False)
            all_boxes[j][i] = cls_dets

        print('im_detect: {:d}/{:d} {:.3f}s'.format(i + 1, num_images, detect_time))

    with open(det_file, 'wb') as f:
        pickle.dump(all_boxes, f, pickle.HIGHEST_PROTOCOL)

    print('Evaluating detections')
    evaluate_detections(all_boxes, output_dir, dataset)
def forward(self, prob, target, reward):
    """
    Args:
        prob: (N, C), torch Variable
        target: (N, ), torch Variable
        reward: (N, ), torch Variable
    """
    prob = prob.view(-1, prob.size(2)).contiguous()
    N = target.size(0)
    C = prob.size(1)
    one_hot = torch.zeros((N, C))
    if prob.is_cuda:
        one_hot = one_hot.cuda()
    one_hot.scatter_(1, target.data.view((-1, 1)), 1)
    one_hot = one_hot.type(torch.ByteTensor)
    one_hot = Variable(one_hot)
    if prob.is_cuda:
        one_hot = one_hot.cuda()
    loss = torch.masked_select(prob, one_hot)
    loss = loss * reward
    loss = -torch.sum(loss)
    return loss
def multiclass_nms(multi_bboxes,
                   multi_scores,
                   score_thr,
                   nms_cfg,
                   max_num=-1,
                   score_factors=None):
    """NMS for multi-class bboxes.

    Args:
        multi_bboxes (Tensor): shape (n, #class*4) or (n, 4)
        multi_scores (Tensor): shape (n, #class), where the last column
            contains scores of the background class, but this will be ignored.
        score_thr (float): bbox threshold, bboxes with scores lower than it
            will not be considered.
        nms_cfg (dict): NMS config (e.g. the IoU threshold).
        max_num (int): if there are more than max_num bboxes after NMS,
            only top max_num will be kept.
        score_factors (Tensor): The factors multiplied to scores before
            applying NMS.

    Returns:
        tuple: (bboxes, labels), tensors of shape (k, 5) and (k, 1). Labels
            are 0-based.
    """
    num_classes = multi_scores.size(1) - 1  # exclude background category
    if multi_bboxes.shape[1] > 4:
        bboxes = multi_bboxes.view(multi_scores.size(0), -1, 4)
    else:
        bboxes = multi_bboxes[:, None].expand(multi_scores.size(0), num_classes, 4)
    scores = multi_scores[:, :-1]

    # filter out boxes with low scores
    valid_mask = scores > score_thr

    # We use masked_select for ONNX exporting purposes,
    # which is equivalent to bboxes = bboxes[valid_mask].
    # TODO: as ONNX does not support repeat now, we have to use this ugly code.
    bboxes = torch.masked_select(
        bboxes,
        torch.stack((valid_mask, valid_mask, valid_mask, valid_mask), -1)).view(-1, 4)
    if score_factors is not None:
        scores = scores * score_factors[:, None]
    scores = torch.masked_select(scores, valid_mask)
    labels = valid_mask.nonzero(as_tuple=False)[:, 1]

    if bboxes.numel() == 0:
        bboxes = multi_bboxes.new_zeros((0, 5))
        labels = multi_bboxes.new_zeros((0, ), dtype=torch.long)
        if torch.onnx.is_in_onnx_export():
            raise RuntimeError('[ONNX Error] Can not record NMS '
                               'as it has not been executed this time')
        return bboxes, labels

    dets, keep = batched_nms(bboxes, scores, labels, nms_cfg)

    if max_num > 0:
        dets = dets[:max_num]
        keep = keep[:max_num]

    return dets, labels[keep]
def perform_qlearning_step(policy_net, target_net, optimizer, replay_buffer, batch_size, gamma, device):
    """ Perform a deep Q-learning step
    Parameters
    -------
    policy_net: torch.nn.Module
        policy Q-network
    target_net: torch.nn.Module
        target Q-network
    optimizer: torch.optim.Adam
        optimizer
    replay_buffer: ReplayBuffer
        replay memory storing transitions
    batch_size: int
        size of batch to sample from replay memory
    gamma: float
        discount factor used in Q-learning update
    device: torch.device
        device on which the models are allocated
    Returns
    -------
    float
        loss value for current learning step
    """
    # 1.1 Sample transitions from replay_buffer
    obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(batch_size)
    obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = (
        torch.tensor(obs_batch), torch.tensor(act_batch), torch.tensor(rew_batch),
        torch.tensor(next_obs_batch), torch.tensor(done_mask))

    # 1.2 Log the sampled batch shapes
    logging.debug("Shapes: obs_batch=%s, act_batch=%s, rew_batch=%s, next_obs_batch=%s, done_mask=%s" % (
        obs_batch.shape, act_batch.shape, rew_batch.shape, next_obs_batch.shape, done_mask.shape))

    # 2. Compute Q(s_t, a) for the actions that were actually taken
    # (each action is an index into the per-state Q-value vector)
    q_batch = policy_net(obs_batch)
    mask = torch.zeros(q_batch.shape).type(torch.ByteTensor)
    for idx, a in enumerate(act_batch):
        mask[idx][a] = 1
    q_batch = torch.masked_select(q_batch, mask)

    # 3. Compute \max_a Q(s_{t+1}, a) for all next states.
    q_next_batch = target_net(next_obs_batch)
    q_next_batch = torch.max(q_next_batch, 1)[0]

    # 4. Mask next-state values where episodes have terminated.
    # Following the Nature paper (page 7, Algorithm 1), the target reduces to the
    # reward for terminal transitions, so the bootstrapped term is zeroed out.
    done_mask = done_mask.type(torch.ByteTensor)
    q_next_batch = q_next_batch.masked_fill(done_mask, 0)

    # 5. Compute the target
    q_next_batch *= gamma
    target = rew_batch + q_next_batch

    # Reset gradients
    optimizer.zero_grad()

    # 6. Compute the loss
    logging.debug("Targets: %s" % target[:5])
    criterion = torch.nn.MSELoss()
    loss = criterion(target, q_batch)

    # 7. Calculate the gradients
    loss.backward()

    # 8. Clip the gradients
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 1)

    # 9. Optimize the model
    optimizer.step()

    return loss.item()
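# A hedged alternative sketch: the per-action Q-values selected above with a
# one-hot byte mask can also be picked with torch.gather, which avoids building
# the mask in a Python loop. Shapes and values below are illustrative only.
import torch

q_values = torch.randn(4, 3)               # (batch, num_actions)
actions = torch.tensor([0, 2, 1, 2])       # action index per transition
q_selected = q_values.gather(1, actions.view(-1, 1)).squeeze(1)  # (batch,)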
def boolean_mask(self, tensor, mask):
    mask = self.astensor(mask).type(torch.ByteTensor).cuda()
    return torch.masked_select(tensor, mask)
def forward_features(self, x): B = x.shape[0] device = x.device outs = [] img = x # stage 1 Unchanged x, (H, W) = self.patch_embed1(x) x = x + self.pos_embed1 x = self.pos_drop1(x) for blk in self.block1: x = blk(x, H, W) # stage 2 y_map, x_map = torch.meshgrid( torch.arange(H, device=device).float() / (H - 1), torch.arange(W, device=device).float() / (W - 1)) xy_map = torch.stack((x_map, y_map), dim=-1) loc = xy_map.reshape(-1, 2)[None, ...].repeat([B, 1, 1]) # split into grid and adaptive tokens pos = torch.arange(x.shape[1], dtype=torch.long, device=x.device) tmp = pos.reshape([H, W]) grid_stride = self.grid_stride pos_grid = tmp[grid_stride // 2:H:grid_stride, grid_stride // 2:W:grid_stride] pos_grid = pos_grid.reshape([-1]) mask = torch.ones(pos.shape, dtype=torch.bool, device=pos.device) mask[pos_grid] = 0 pos_ada = torch.masked_select(pos, mask) x_grid = torch.index_select(x, 1, pos_grid) x_ada = torch.index_select(x, 1, pos_ada) loc_grid = torch.index_select(loc, 1, pos_grid) loc_ada = torch.index_select(loc, 1, pos_ada) x = torch.cat([x_grid, x_ada], 1) loc = torch.cat([loc_grid, loc_ada], 1) N_grid = x_grid.shape[1] if vis: outs.append((x, loc, [H, W])) # stage 2 x, loc = self.down_layers1(x, loc, self.pos_embed2, H, W, self.pos_size, N_grid) # down sample H, W = H // 2, W // 2 for blk in self.block2: x = blk(x, x, loc, H, W) if vis: outs.append((x, loc, [H, W])) # stage 3 x, loc = self.down_layers2(x, loc, self.pos_embed3, H, W, self.pos_size, N_grid) # down sample H, W = H // 2, W // 2 for blk in self.block3: x = blk(x, x, loc, H, W) if vis: outs.append((x, loc, [H, W])) # stage 4 x, loc = self.down_layers3(x, loc, self.pos_embed4, H, W, self.pos_size, N_grid) # down sample H, W = H // 2, W // 2 cls_tokens = self.cls_token.expand(B, -1, -1) x = torch.cat((cls_tokens, x), dim=1) for blk in self.block4: x = blk(x, x, loc, H, W) if vis: outs.append((x, loc, [H, W])) # show_tokens(img, outs, N_grid) if self.num % 1 == 0: show_tokens(img, outs, N_grid) self.num = self.num + 1 x = self.norm(x) return x[:, 0]
def forward(self, gt, pre, pre1, pre2, weight1, bias1, weight2, bias2, feat1, feat2, flag): N = gt.size(0) mask = flag.eq(1) pre_label1 = torch.masked_select(pre1, mask) pre_label1 = pre_label1.view(-1, self.AU_num) pre_label2 = torch.masked_select(pre2, mask) pre_label2 = pre_label2.view(-1, self.AU_num) pre_label = torch.masked_select(pre, mask) pre_label = pre_label.view(-1, self.AU_num) gt = torch.masked_select(gt, mask) gt = gt.view(-1, self.AU_num) if bool(gt.numel()): loss_pred = self.lossfunc(pre_label, gt) loss_pred1 = self.lossfunc(pre_label1, gt) loss_pred2 = self.lossfunc(pre_label2, gt) else: loss_pred = Variable(torch.FloatTensor([0])).cuda() loss_pred1 = Variable(torch.FloatTensor([0])).cuda() loss_pred2 = Variable(torch.FloatTensor([0])).cuda() if self.fusion_mode == 0: loss_BCE = (loss_pred1 + loss_pred2) / 2 else: loss_BCE = loss_pred + (loss_pred1 + loss_pred2) / 2 ############### loss multi-view ######## loss_multi_view = torch.FloatTensor([0]) loss_multi_view = loss_multi_view.cuda() bias1 = bias1.view(self.AU_num, -1) feat1 = torch.cat((weight1, bias1), 1) bias2 = bias2.view(self.AU_num, -1) feat2 = torch.cat((weight2, bias2), 1) tmp = torch.norm(feat1, 2, 1) feat_norm1 = feat1 / tmp.view(self.AU_num, -1) tmp = torch.norm(feat2, 2, 1) feat_norm2 = feat2 / tmp.view(self.AU_num, -1) x = feat_norm1 * feat_norm2 x = torch.sum(x, 1) loss_weight_orth = torch.mean(torch.abs(x)) loss_multi_view = loss_multi_view + loss_weight_orth loss_multi_view = loss_multi_view * self.lambda_multi_view ############ end loss multi-view ####### ################# J-S divergence ################# loss_similar = torch.FloatTensor([0]) loss_similar = loss_similar.cuda() if self.use_web != 0: p1 = self.sigmoid(pre1) log_p1 = self.log_sigmoid(pre1) p2 = self.sigmoid(pre2) log_p2 = self.log_sigmoid(pre2) p = (p1 + p2) / 2 # print(torch.max(p1)); # print(torch.min(p1)); if self.select_sample == 0: mask_idx = torch.ge(p1, -1) elif self.select_sample == 1: mask_idx1 = torch.ge(p1, -1) mask_idx2 = torch.ge(p1, -1) p_scale1 = p1 * p1 + p2 * p2 p_scale2 = (1 - p1) * (1 - p1) + (1 - p2) * (1 - p2) for i in range(0, self.AU_num): r = (1 - self.sample_weight[i]) * ( 1 - self.sample_weight[i]) * 2 * self.sample_scale idx_temp = torch.le(p_scale1[:, i], r) mask_idx1[:, i] = idx_temp r = self.sample_weight[i] * self.sample_weight[ i] * 2 * self.sample_scale idx_temp = torch.le(p_scale2[:, i], r) mask_idx2[:, i] = idx_temp mask_idx = mask_idx1 | mask_idx2 elif self.select_sample == 2: mask_idx1 = torch.ge(p1, -1) mask_idx2 = torch.ge(p1, -1) p_scale1 = (p1 - 1) * (p1 - 1) + p2 * p2 p_scale2 = p1 * p1 + (1 - p2) * (1 - p2) for i in range(0, self.AU_num): r = self.sample_r idx_temp = torch.le(p_scale1[:, i], r) mask_idx1[:, i] = idx_temp idx_temp = torch.le(p_scale2[:, i], r) mask_idx2[:, i] = idx_temp mask_idx = mask_idx1 | mask_idx2 idx1 = torch.le(p1, 1 - self.eps) idx2 = torch.ge(p1, self.eps) idx = idx1 & idx2 & mask_idx tmp_p1 = 1 - p1[idx] + self.eps Hp1 = torch.mean(-(p1[idx] * log_p1[idx] + tmp_p1 * torch.log(tmp_p1))) idx1 = torch.le(p2, 1 - self.eps) idx2 = torch.ge(p2, self.eps) idx = idx1 & idx2 & mask_idx tmp_p2 = 1 - p2[idx] + self.eps Hp2 = torch.mean(-(p2[idx] * log_p2[idx] + tmp_p2 * torch.log(tmp_p2))) idx1 = torch.le(p, 1 - self.eps) idx2 = torch.ge(p, self.eps) idx = idx1 & idx2 & mask_idx tmp_p11 = p[idx] + self.eps tmp_p22 = 1 - p[idx] + self.eps H1 = torch.mean(-(tmp_p11 * torch.log(tmp_p11) + (tmp_p22) * torch.log(tmp_p22))) H2 = (Hp1 + Hp2) / 2 loss_web = torch.abs(H1 - H2) loss_similar 
= loss_web loss_similar = loss_similar * self.lambda_co_regularization ################# end J-S divergence ################# loss = loss_BCE + loss_multi_view + loss_similar return loss, loss_pred, loss_pred1, loss_pred2, loss_multi_view, loss_similar
def test_net(save_folder, net, cuda, dataset, transform, top_k, im_size=300, thresh=0.05): num_images = len(dataset) # all detections are collected into: # all_boxes[cls][image] = N x 5 array of detections in # (x1, y1, x2, y2, score) all_boxes = [[[] for _ in range(num_images)] for _ in range(len(labelmap) + 1)] # timers _t = {'im_detect': Timer(), 'misc': Timer()} output_dir = get_output_dir('ssd300_120000', set_type) det_file = os.path.join(output_dir, 'detections.pkl') for i in range(num_images): im, gt, h, w = dataset.pull_item(i) # 这里im的颜色偏暗,因为BaseTransform减去了一个mean # im_saver = cv2.resize(im[(a2,a1,0),:,:].permute((a1,a2,0)).numpy(), (w,h)) im_det = dataset.pull_image(i) # print(im_det) # print("======\n") x = Variable(im.unsqueeze(0)) if args.cuda: x = x.cuda() _t['im_detect'].tic() detections = net(x).data detect_time = _t['im_detect'].toc(average=False) # skip j = 0, because it's the background class # // # // # print(detections) for j in range(1, detections.size(1)): dets = detections[0, j, :] mask = dets[:, 0].gt(0.).expand(5, dets.size(0)).t() dets = torch.masked_select(dets, mask).view(-1, 5) if dets.size(0) == 0: continue boxes = dets[:, 1:] boxes[:, 0] *= w boxes[:, 2] *= w boxes[:, 1] *= h boxes[:, 3] *= h # print(boxes) scores = dets[:, 0].cpu().numpy() cls_dets = np.hstack( (boxes.cpu().numpy(), scores[:, np.newaxis])).astype(np.float32, copy=False) all_boxes[j][i] = cls_dets # print(all_boxes) for item in cls_dets: # print(item) # print(item[5]) if item[4] > thresh: # print(item) chinese = labelmap[j - 1] + str(round(item[4], 2)) # print(chinese+'det\n\n') if chinese[0] == '带': chinese = 'P_Battery_Core' + chinese[6:] else: chinese = 'P_Battery_No_Core' + chinese[7:] cv2.rectangle(im_det, (item[0], item[1]), (item[2], item[3]), (0, 0, 255), 2) cv2.putText(im_det, chinese, (int(item[0]), int(item[1]) - 5), 0, 0.6, (0, 0, 255), 2) real = 0 if gt[0][4] == 3: real = 0 else: real = 1 for item in gt: if real == 0: print('this pic dont have the obj:', dataset.ids[i]) break chinese = labelmap[int(item[4])] # print(chinese+'gt\n\n') if chinese[0] == '带': chinese = 'P_Battery_Core' else: chinese = 'P_Battery_No_Core' cv2.rectangle(im_det, (int(item[0] * w), int(item[1] * h)), (int(item[2] * w), int(item[3] * h)), (0, 255, 255), 2) cv2.putText(im_det, chinese, (int(item[0] * w), int(item[1] * h) - 5), 0, 0.6, (0, 255, 255), 2) # print(labelmap[int(item[4])]) # print('im_detect: {:d}/{:d} {:.3f}s'.format(i + 1, num_images, detect_time)) # cv2.imwrite('/media/trs2/wuzhangjie/SSD/eval/Xray20190723/Attention/base_battery_core_bs8_V/det_images/{0}_det.jpg'.format(dataset.ids[i]), im_det) # cv2.imwrite('/media/dsg3/shiyufeng/eval/Xray20190723/battery_2cV_version/20epoch_network/{0}_gt.jpg'.format(dataset.ids[i]), im_gt) # cv2.imwrite( '/media/dsg3/husheng/eval/{0}_det.jpg'.format(dataset.ids[i]), im_det) # cv2.imwrite( '/media/dsg3/husheng/eval/{0}_gt.jpg'.format(dataset.ids[i]), im_gt) with open(det_file, 'wb') as f: pickle.dump(all_boxes, f, pickle.HIGHEST_PROTOCOL) # print('Evaluating detections') evaluate_detections(all_boxes, output_dir, dataset)
def test_net(save_folder, net, cuda, testset, transform, max_per_image=300, thresh=0.005): if not os.path.exists(save_folder): os.mkdir(save_folder) # dump predictions and assoc. ground truth to text file for now num_images = len(testset) num_classes = (21, 81)[args.dataset == 'COCO'] all_boxes = [[[] for _ in range(num_images)] for _ in range(num_classes)] _t = {'im_detect': Timer(), 'misc': Timer()} det_file = os.path.join(save_folder, 'detections.pkl') if args.retest: f = open(det_file, 'rb') all_boxes = pickle.load(f) print('Evaluating detections') testset.evaluate_detections(all_boxes, save_folder) print('Evalutating done') return for i in range(num_images): img, _, h, w = testset.pull_item(i) scale = torch.Tensor([w, h, w, h]) with torch.no_grad(): # x = transform(img).unsqueeze(0) x = img.unsqueeze(0) if cuda: x = x.cuda() scale = scale.cuda() _t['im_detect'].tic() detections = net(x) # forward pass detections.detach_() detect_time = _t['im_detect'].toc(average=False) # skip j = 0, because it's the background class for j in range(1, detections.size(1)): dets = detections[0, j, :] mask = dets[:, 0].gt(0.).expand(5, dets.size(0)).t() dets = torch.masked_select(dets, mask).view(-1, 5) if dets.size(0) == 0: continue boxes = dets[:, 1:] boxes[:, 0] *= w boxes[:, 2] *= w boxes[:, 1] *= h boxes[:, 3] *= h scores = dets[:, 0].cpu().numpy() cls_dets = np.hstack( (boxes.cpu().numpy(), scores[:, np.newaxis])).astype(np.float32, copy=False) all_boxes[j][i] = cls_dets ''' # boxes, scores = detector.forward(out,priors) detect_time = _t['im_detect'].toc() boxes = boxes[0] scores = scores[0] boxes *= scale boxes = boxes.cpu().numpy() scores = scores.cpu().numpy() # scale each detection back up to the image _t['misc'].tic() for j in range(1, num_classes): inds = np.where(scores[:, j] > thresh)[0] if len(inds) == 0: all_boxes[j][i] = np.empty([0, 5], dtype=np.float32) continue c_bboxes = boxes[inds] c_scores = scores[inds, j] c_dets = np.hstack((c_bboxes, c_scores[:, np.newaxis])).astype( np.float32, copy=False) keep = nms(c_dets, 0.45, force_cpu=args.cpu) c_dets = c_dets[keep, :] all_boxes[j][i] = c_dets if max_per_image > 0: image_scores = np.hstack([all_boxes[j][i][:, -1] for j in range(1,num_classes)]) if len(image_scores) > max_per_image: image_thresh = np.sort(image_scores)[-max_per_image] for j in range(1, num_classes): keep = np.where(all_boxes[j][i][:, -1] >= image_thresh)[0] all_boxes[j][i] = all_boxes[j][i][keep, :] nms_time = _t['misc'].toc() ''' if i % 20 == 0: print('im_detect: {:d}/{:d} {:.3f}s'.format( i + 1, num_images, detect_time)) _t['im_detect'].clear() with open(det_file, 'wb') as f: pickle.dump(all_boxes, f, pickle.HIGHEST_PROTOCOL) print('Evaluating detections') testset.evaluate_detections(all_boxes, save_folder)
def __call__(self, prediction_labels: torch.Tensor, gold_labels: torch.Tensor, mask: torch.Tensor) -> Dict: """ 计算 metric. 返回的是 F1 字典: {"precision_[tag]": [value], "recall_[tag]" : [value], "f1-measure_[tag]": [value], "precision-overall": [value], "recall-overall": [value], "f1-measure-overall": [value]} 其中的 [tag] 是 span 的 tag, 也就是 "B-[tag]" 中的 "[tag]" :param prediction_labels: 预测的结果, shape: (B, SeqLen) :param gold_labels: 实际的结果, shape: (B, SeqLen) :param mask: 对 predictions 和 gold label 的 mask, shape: (B, SeqLen) :return: 当前的 metric 计算字典结果. """ if prediction_labels.dim() != 2: raise RuntimeError( f"prediction_labels shape 应该是: (B, SeqLen), 现在是:{prediction_labels.size()}" ) if gold_labels.dim() != 2: raise RuntimeError( f"gold_labels shape 应该是: (B, SeqLen), 现在是:{gold_labels.size()}" ) if mask is not None: if mask.dim() != 2: raise RuntimeError( f"mask shape 应该是: (B, SeqLen), 现在是:{mask.size()}") # 转换到 cpu 进行计算 prediction_labels, gold_labels = prediction_labels.detach().cpu( ), gold_labels.detach().cpu() if mask is not None: mask = mask.detach().cpu() else: mask = torch.ones(size=(prediction_labels.size(0), prediction_labels.size(1)), dtype=torch.long).cpu() assert prediction_labels.size() == gold_labels.size(), \ f"prediction_labels.size: {prediction_labels.size()} 与 gold_labels.size: {gold_labels.size()} 不匹配!" assert prediction_labels.size() == mask.size(), \ f"prediction_labels.size: {prediction_labels.size()} 与 mask.size: {mask.size()} 不匹配!" bool_mask = (mask != 0) num_classes = self.label_vocabulary.label_size if (torch.masked_select(gold_labels, bool_mask) >= num_classes).any(): raise RuntimeError(f"gold_labels 中存在比 num_classes 大的数值") # 将预测的结果 decode 成 span list prediction_spans_list = BIO.decode_label_index_to_span( batch_sequence_label_index=prediction_labels, mask=mask, vocabulary=self.label_vocabulary) # 将gold label index decode 成 span list gold_spans_list = BIO.decode_label_index_to_span( batch_sequence_label_index=gold_labels, mask=mask, vocabulary=self.label_vocabulary) # 预测的 每个 label 的 span 数量字典 num_prediction = defaultdict(int) # golden 每一个 label 的 span num_golden = defaultdict(int) # 当前 batch 下的 true_positives true_positives = defaultdict(int) false_positives = defaultdict(int) false_negatives = defaultdict(int) for prediction_spans, gold_spans in zip(prediction_spans_list, gold_spans_list): intersection = BIO.span_intersection(span_list1=prediction_spans, span_list2=gold_spans) for span in intersection: # self._true_positives[span["label"]] += 1 true_positives[span["label"]] += 1 for span in prediction_spans: num_prediction[span["label"]] += 1 for span in gold_spans: num_golden[span["label"]] += 1 for label, num in num_prediction.items(): false_positives[label] = num - true_positives[label] for label, num in num_golden.items(): false_negatives[label] = num - true_positives[label] for k, v in true_positives.items(): self._true_positives[k] += v for k, v in false_positives.items(): self._false_positives[k] += v for k, v in false_negatives.items(): self._false_negatives[k] += v return self._metric(true_positives=true_positives, false_positives=false_positives, false_negatives=false_negatives)
def do_train(train_loader, model, criterion, optimizer, epoch, args): batch_time = utils.AverageMeter('Time', ':6.3f') data_time = utils.AverageMeter('Data', ':6.3f') losses = utils.AverageMeter('Loss', ':.3f') top1 = utils.AverageMeter('Acc@1', ':6.2f') top5 = utils.AverageMeter('Acc@5', ':6.2f') learning_rate = utils.AverageMeter('LR', ':.4f') losses_id = utils.AverageMeter('L_ID', ':.3f') losses_mag = utils.AverageMeter('L_mag', ':.6f') progress_template = [ batch_time, data_time, losses, losses_id, losses_mag, top1, top5, learning_rate ] progress = utils.ProgressMeter(len(train_loader), progress_template, prefix="Epoch: [{}]".format(epoch)) end = time.time() # update lr learning_rate.update(current_lr) for i, (input, target) in enumerate(train_loader): # measure data loading time data_time.update(time.time() - end) global iters iters += 1 input = input.cuda(non_blocking=True) target = target.cuda(non_blocking=True) # compute output output, x_norm = model(input, target) loss_id, loss_g, one_hot = criterion(output, target, x_norm) loss = loss_id + args.lambda_g * loss_g # measure accuracy and record loss acc1, acc5 = utils.accuracy(args, output[0], target, topk=(1, 5)) losses.update(loss.item(), input.size(0)) top1.update(acc1[0], input.size(0)) top5.update(acc5[0], input.size(0)) losses_id.update(loss_id.item(), input.size(0)) losses_mag.update(args.lambda_g * loss_g.item(), input.size(0)) # compute gradient and do solver step optimizer.zero_grad() loss.backward() optimizer.step() # measure elapsed time batch_time.update(time.time() - end) end = time.time() if i % args.print_freq == 0: progress.display(i) if args.vis_mag: if (i > 10000) and (i % 100 == 0): x_norm = x_norm.detach().cpu().numpy() cos_theta = torch.masked_select( output[0], one_hot.bool()).detach().cpu().numpy() logit = torch.masked_select(F.softmax( output[0]), one_hot.bool()).detach().cpu().numpy() np.savez( '{}/vis/epoch_{}_iter{}'.format(args.pth_save_fold, epoch, i), x_norm, logit, cos_theta)
def batch_select(mat, idx):
    mask = torch.arange(mat.size(1)).expand_as(mat).to(mat.device, dtype=torch.long)
    mask = (mask == idx.view(-1, 1))
    return torch.masked_select(mat, mask)
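# A minimal usage sketch for batch_select (illustrative values): keep one element
# per row of a 2-D tensor, where idx[i] is the column to select in row i.
import torch

mat = torch.tensor([[10., 11., 12.],
                    [20., 21., 22.]])
idx = torch.tensor([2, 0])
print(batch_select(mat, idx))  # tensor([12., 20.])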
# 2. Indexing, Slicing, Joining, Reshaping

# 1) Indexing
x = torch.rand(4, 3)

# torch.index_select
out = torch.index_select(x, 0, torch.LongTensor([0, 3]))
# print(x, out)

# pythonic indexing
x[:, 0], x[0, :], x[0:2, 0:2]

# torch.masked_select
x = torch.randn(2, 3)
mask = torch.ByteTensor([[0, 0, 1], [0, 1, 0]])
out = torch.masked_select(x, mask)
# x, mask, out

# 2) Joining
# torch.cat(seq, dim=0) concatenates tensors along dim

# 1 2 3
# 4 5 6
x = torch.FloatTensor([[1, 2, 3], [4, 5, 6]])
# -1 -2 -3
# -4 -5 -6
y = torch.FloatTensor([[-1, -2, -3], [-4, -5, -6]])
# 1 2 3
# 4 5 6
# -1 -2 -3
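# Note (assumes a recent PyTorch release): masks passed to torch.masked_select
# should be torch.bool rather than torch.uint8/ByteTensor, which now triggers a
# deprecation warning. A minimal sketch of the same selection with a boolean mask:
x = torch.randn(2, 3)
mask = x > 0                          # bool mask with the same shape as x
out = torch.masked_select(x, mask)    # 1-D tensor of the positive entries, in row-major order
# equivalent shorthand: out = x[mask]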
def forward(self, inputs, targets, nonorm):
    n = inputs.size(0)
    sim_mat = torch.matmul(inputs, inputs.t())
    base = 0.5
    loss = list()
    c = 0

    for i in range(n):
        pos_pair_ = torch.masked_select(sim_mat[i], targets == targets[i])
        # remove the pair of the sample with itself
        pos_pair_ = torch.masked_select(pos_pair_, pos_pair_ < self.pos_margin)
        neg_pair_ = torch.masked_select(sim_mat[i], targets != targets[i])

        if self.sample_method is not None:
            # pos_pair_ = torch.masked_select(pos_pair_, pos_pair_ < self.pos_margin)
            neg_pair_ = neg_pair_[neg_pair_ + 0.5 > min(pos_pair_)]

        neg_pair_ = torch.masked_select(neg_pair_, neg_pair_ > self.neg_margin)
        pos_pair_ = torch.sort(pos_pair_)[0]
        neg_pair_ = torch.sort(neg_pair_)[0]

        if self.Dynamic_margin is not None:
            pos_pair = pos_pair_
            neg_pair = neg_pair_
            pos_loss = 1.0 / 2 * torch.log(1 + torch.sum(
                torch.exp(-2 * (pos_pair - 0.5)
                          + self.epoch_num / 300 * (pos_pair - self.pos_margin) ** 2)))
            neg_loss = 1.0 / 50 * torch.log(1 + torch.sum(
                torch.exp(50 * (neg_pair - 0.5)
                          + self.epoch_num / 300 * (self.neg_margin - neg_pair) ** 2)))
        else:
            pos_pair = pos_pair_
            neg_pair = neg_pair_
            pos_loss = 1.0 / 2 * torch.log(
                1 + torch.sum(torch.exp(-2 * (pos_pair - 0.5))))
            neg_loss = 1.0 / 50 * torch.log(
                1 + torch.sum(torch.exp(50 * (neg_pair - 0.5))))

        if len(neg_pair) == 0:
            c += 1
            continue

        loss.append(pos_loss + neg_loss)

    loss = sum(loss) / n
    prec = float(c) / n
    mean_neg_sim = torch.mean(neg_pair_).item()
    mean_pos_sim = torch.mean(pos_pair_).item()
    return loss, prec, mean_pos_sim, mean_neg_sim
def __init__(self, tensor): self.floating_dtype = tensor.dtype.is_floating_point self.int_mode = True self.sci_mode = False self.max_width = 1 with torch.no_grad(): tensor_view = tensor.reshape(-1) if not self.floating_dtype: for value in tensor_view: value_str = '{}'.format(value) self.max_width = max(self.max_width, len(value_str)) else: nonzero_finite_vals = torch.masked_select( tensor_view, torch.isfinite(tensor_view) & tensor_view.ne(0)) if nonzero_finite_vals.numel() == 0: # no valid number, do nothing return # Convert to double for easy calculation. HalfTensor overflows with 1e8, and there's no div() on CPU. nonzero_finite_abs = nonzero_finite_vals.abs().double() nonzero_finite_min = nonzero_finite_abs.min().double() nonzero_finite_max = nonzero_finite_abs.max().double() for value in nonzero_finite_vals: if value != torch.ceil(value): self.int_mode = False break if self.int_mode: # in int_mode for floats, all numbers are integers, and we append a decimal to nonfinites # to indicate that the tensor is of floating type. add 1 to the len to account for this. if nonzero_finite_max / nonzero_finite_min > 1000. or nonzero_finite_max > 1.e8: self.sci_mode = True for value in nonzero_finite_vals: value_str = ('{{:.{}e}}').format( PRINT_OPTS.precision).format(value) self.max_width = max(self.max_width, len(value_str)) else: for value in nonzero_finite_vals: value_str = ('{:.0f}').format(value) self.max_width = max(self.max_width, len(value_str) + 1) else: # Check if scientific representation should be used. if nonzero_finite_max / nonzero_finite_min > 1000.\ or nonzero_finite_max > 1.e8\ or nonzero_finite_min < 1.e-4: self.sci_mode = True for value in nonzero_finite_vals: value_str = ('{{:.{}e}}').format( PRINT_OPTS.precision).format(value) self.max_width = max(self.max_width, len(value_str)) else: for value in nonzero_finite_vals: value_str = ('{{:.{}f}}').format( PRINT_OPTS.precision).format(value) self.max_width = max(self.max_width, len(value_str)) if PRINT_OPTS.sci_mode is not None: self.sci_mode = PRINT_OPTS.sci_mode
def generate( self, models, sample, prefix_tokens=None, bos_token=None, **kwargs ): """Generate a batch of translations. Args: models (List[~fairseq.models.FairseqModel]): ensemble of models sample (dict): batch prefix_tokens (torch.LongTensor, optional): force decoder to begin with these tokens """ model = EnsembleModel(models) if not self.retain_dropout: model.eval() # model.forward normally channels prev_output_tokens into the decoder # separately, but SequenceGenerator directly calls model.encoder encoder_input = { k: v for k, v in sample['net_input'].items() if k != 'prev_output_tokens' } src_tokens = encoder_input['src_tokens'] src_lengths = (src_tokens.ne(self.eos) & src_tokens.ne(self.pad)).long().sum(dim=1) input_size = src_tokens.size() # batch dimension goes first followed by source lengths bsz = input_size[0] src_len = input_size[1] beam_size = self.beam_size if self.match_source_len: max_len = src_lengths.max().item() else: max_len = min( int(self.max_len_a * src_len + self.max_len_b), # exclude the EOS marker model.max_decoder_positions() - 1, ) # compute the encoder output for each beam encoder_outs = model.forward_encoder(encoder_input) self.encoder_input = encoder_input new_order = torch.arange(bsz).view(-1, 1).repeat(1, beam_size).view(-1) new_order = new_order.to(src_tokens.device).long() encoder_outs = model.reorder_encoder_out(encoder_outs, new_order) # print('first....................................................................') model.reorder_encoder_input(self.encoder_input, new_order) # initialize buffers scores = src_tokens.new(bsz * beam_size, max_len + 1).float().fill_(0) scores_buf = scores.clone() tokens = src_tokens.data.new(bsz * beam_size, max_len + 2).long().fill_(self.pad) tokens_buf = tokens.clone() tokens[:, 0] = bos_token or self.eos attn, attn_buf = None, None nonpad_idxs = None # The blacklist indicates candidates that should be ignored. # For example, suppose we're sampling and have already finalized 2/5 # samples. Then the blacklist would mark 2 positions as being ignored, # so that we only finalize the remaining 3 samples. blacklist = src_tokens.new_zeros(bsz, beam_size).eq(-1) # forward and backward-compatible False mask # list of completed sentences finalized = [[] for i in range(bsz)] finished = [False for i in range(bsz)] worst_finalized = [{'idx': None, 'score': -math.inf} for i in range(bsz)] num_remaining_sent = bsz # number of candidate hypos per step cand_size = 2 * beam_size # 2 x beam size in case half are EOS # offset arrays for converting between different indexing schemes bbsz_offsets = (torch.arange(0, bsz) * beam_size).unsqueeze(1).type_as(tokens) cand_offsets = torch.arange(0, cand_size).type_as(tokens) # helper function for allocating buffers on the fly buffers = {} def buffer(name, type_of=tokens): # noqa if name not in buffers: buffers[name] = type_of.new() return buffers[name] def is_finished(sent, step, unfin_idx, unfinalized_scores=None): """ Check whether we've finished generation for a given sentence, by comparing the worst score among finalized hypotheses to the best possible score among unfinalized hypotheses. 
""" assert len(finalized[sent]) <= beam_size if len(finalized[sent]) == beam_size: if self.stop_early or step == max_len or unfinalized_scores is None: return True # stop if the best unfinalized score is worse than the worst # finalized one best_unfinalized_score = unfinalized_scores[unfin_idx].max() if self.normalize_scores: best_unfinalized_score /= max_len ** self.len_penalty if worst_finalized[sent]['score'] >= best_unfinalized_score: return True return False def finalize_hypos(step, bbsz_idx, eos_scores, unfinalized_scores=None): """ Finalize the given hypotheses at this step, while keeping the total number of finalized hypotheses per sentence <= beam_size. Note: the input must be in the desired finalization order, so that hypotheses that appear earlier in the input are preferred to those that appear later. Args: step: current time step bbsz_idx: A vector of indices in the range [0, bsz*beam_size), indicating which hypotheses to finalize eos_scores: A vector of the same size as bbsz_idx containing scores for each hypothesis unfinalized_scores: A vector containing scores for all unfinalized hypotheses """ assert bbsz_idx.numel() == eos_scores.numel() # clone relevant token and attention tensors tokens_clone = tokens.index_select(0, bbsz_idx) tokens_clone = tokens_clone[:, 1:step + 2] # skip the first index, which is EOS tokens_clone[:, step] = self.eos attn_clone = attn.index_select(0, bbsz_idx)[:, :, 1:step+2] if attn is not None else None # compute scores per token position pos_scores = scores.index_select(0, bbsz_idx)[:, :step+1] pos_scores[:, step] = eos_scores # convert from cumulative to per-position scores pos_scores[:, 1:] = pos_scores[:, 1:] - pos_scores[:, :-1] # normalize sentence-level scores if self.normalize_scores: eos_scores /= (step + 1) ** self.len_penalty cum_unfin = [] prev = 0 for f in finished: if f: prev += 1 else: cum_unfin.append(prev) sents_seen = set() for i, (idx, score) in enumerate(zip(bbsz_idx.tolist(), eos_scores.tolist())): unfin_idx = idx // beam_size sent = unfin_idx + cum_unfin[unfin_idx] sents_seen.add((sent, unfin_idx)) if self.match_source_len and step > src_lengths[unfin_idx]: score = -math.inf def get_hypo(): if attn_clone is not None: # remove padding tokens from attn scores hypo_attn = attn_clone[i][nonpad_idxs[sent]] _, alignment = hypo_attn.max(dim=0) else: hypo_attn = None alignment = None return { 'tokens': tokens_clone[i], 'score': score, 'attention': hypo_attn, # src_len x tgt_len 'alignment': alignment, 'positional_scores': pos_scores[i], } if len(finalized[sent]) < beam_size: finalized[sent].append(get_hypo()) elif not self.stop_early and score > worst_finalized[sent]['score']: # replace worst hypo for this sentence with new/better one worst_idx = worst_finalized[sent]['idx'] if worst_idx is not None: finalized[sent][worst_idx] = get_hypo() # find new worst finalized hypo for this sentence idx, s = min(enumerate(finalized[sent]), key=lambda r: r[1]['score']) worst_finalized[sent] = { 'score': s['score'], 'idx': idx, } newly_finished = [] for sent, unfin_idx in sents_seen: # check termination conditions for this sentence if not finished[sent] and is_finished(sent, step, unfin_idx, unfinalized_scores): finished[sent] = True newly_finished.append(unfin_idx) return newly_finished reorder_state = None batch_idxs = None for step in range(max_len + 1): # one extra step for EOS marker # reorder decoder internal states based on the prev choice of beams if reorder_state is not None: if batch_idxs is not None: # update beam indices to take into 
account removed sentences corr = batch_idxs - torch.arange(batch_idxs.numel()).type_as(batch_idxs) reorder_state.view(-1, beam_size).add_(corr.unsqueeze(-1) * beam_size) model.reorder_incremental_state(reorder_state) model.reorder_encoder_out(encoder_outs, reorder_state) model.reorder_encoder_input(self.encoder_input, reorder_state) lprobs, avg_attn_scores = model.forward_decoder( tokens[:, :step + 1], encoder_outs, temperature=self.temperature, sample=self.encoder_input ) lprobs[:, self.pad] = -math.inf # never select pad lprobs[:, self.unk] -= self.unk_penalty # apply unk penalty if self.no_repeat_ngram_size > 0: # for each beam and batch sentence, generate a list of previous ngrams gen_ngrams = [{} for bbsz_idx in range(bsz * beam_size)] for bbsz_idx in range(bsz * beam_size): gen_tokens = tokens[bbsz_idx].tolist() for ngram in zip(*[gen_tokens[i:] for i in range(self.no_repeat_ngram_size)]): gen_ngrams[bbsz_idx][tuple(ngram[:-1])] = \ gen_ngrams[bbsz_idx].get(tuple(ngram[:-1]), []) + [ngram[-1]] # Record attention scores if avg_attn_scores is not None: if attn is None: attn = scores.new(bsz * beam_size, src_tokens.size(1), max_len + 2) attn_buf = attn.clone() nonpad_idxs = src_tokens.ne(self.pad) attn[:, :, step + 1].copy_(avg_attn_scores) scores = scores.type_as(lprobs) scores_buf = scores_buf.type_as(lprobs) eos_bbsz_idx = buffer('eos_bbsz_idx') eos_scores = buffer('eos_scores', type_of=scores) if step < max_len: self.search.set_src_lengths(src_lengths) if self.no_repeat_ngram_size > 0: def calculate_banned_tokens(bbsz_idx): # before decoding the next token, prevent decoding of ngrams that have already appeared ngram_index = tuple(tokens[bbsz_idx, step + 2 - self.no_repeat_ngram_size:step + 1].tolist()) return gen_ngrams[bbsz_idx].get(ngram_index, []) if step + 2 - self.no_repeat_ngram_size >= 0: # no banned tokens if we haven't generated no_repeat_ngram_size tokens yet banned_tokens = [calculate_banned_tokens(bbsz_idx) for bbsz_idx in range(bsz * beam_size)] else: banned_tokens = [[] for bbsz_idx in range(bsz * beam_size)] for bbsz_idx in range(bsz * beam_size): lprobs[bbsz_idx, banned_tokens[bbsz_idx]] = -math.inf if prefix_tokens is not None and step < prefix_tokens.size(1): probs_slice = lprobs.view(bsz, -1, lprobs.size(-1))[:, 0, :] cand_scores = torch.gather( probs_slice, dim=1, index=prefix_tokens[:, step].view(-1, 1) ).view(-1, 1).repeat(1, cand_size) if step > 0: # save cumulative scores for each hypothesis cand_scores.add_(scores[:, step - 1].view(bsz, beam_size).repeat(1, 2)) cand_indices = prefix_tokens[:, step].view(-1, 1).repeat(1, cand_size) cand_beams = torch.zeros_like(cand_indices) # handle prefixes of different lengths partial_prefix_mask = prefix_tokens[:, step].eq(self.pad) if partial_prefix_mask.any(): partial_scores, partial_indices, partial_beams = self.search.step( step, lprobs.view(bsz, -1, self.vocab_size), scores.view(bsz, beam_size, -1)[:, :, :step], ) cand_scores[partial_prefix_mask] = partial_scores[partial_prefix_mask] cand_indices[partial_prefix_mask] = partial_indices[partial_prefix_mask] cand_beams[partial_prefix_mask] = partial_beams[partial_prefix_mask] else: cand_scores, cand_indices, cand_beams = self.search.step( step, lprobs.view(bsz, -1, self.vocab_size), scores.view(bsz, beam_size, -1)[:, :, :step], ) else: # make probs contain cumulative scores for each hypothesis lprobs.add_(scores[:, step - 1].unsqueeze(-1)) # finalize all active hypotheses once we hit max_len # pick the hypothesis with the highest prob of EOS right now torch.sort( 
lprobs[:, self.eos], descending=True, out=(eos_scores, eos_bbsz_idx), ) num_remaining_sent -= len(finalize_hypos(step, eos_bbsz_idx, eos_scores)) assert num_remaining_sent == 0 break # cand_bbsz_idx contains beam indices for the top candidate # hypotheses, with a range of values: [0, bsz*beam_size), # and dimensions: [bsz, cand_size] cand_bbsz_idx = cand_beams.add(bbsz_offsets) # finalize hypotheses that end in eos eos_mask = cand_indices.eq(self.eos) finalized_sents = set() if step >= self.min_len: # only consider eos when it's among the top beam_size indices torch.masked_select( cand_bbsz_idx[:, :beam_size], mask=eos_mask[:, :beam_size], out=eos_bbsz_idx, ) if eos_bbsz_idx.numel() > 0: torch.masked_select( cand_scores[:, :beam_size], mask=eos_mask[:, :beam_size], out=eos_scores, ) finalized_sents = finalize_hypos(step, eos_bbsz_idx, eos_scores, cand_scores) num_remaining_sent -= len(finalized_sents) assert num_remaining_sent >= 0 if num_remaining_sent == 0: break assert step < max_len if len(finalized_sents) > 0: new_bsz = bsz - len(finalized_sents) # construct batch_idxs which holds indices of batches to keep for the next pass batch_mask = cand_indices.new_ones(bsz) batch_mask[cand_indices.new(finalized_sents)] = 0 batch_idxs = batch_mask.nonzero().squeeze(-1) eos_mask = eos_mask[batch_idxs] cand_beams = cand_beams[batch_idxs] bbsz_offsets.resize_(new_bsz, 1) cand_bbsz_idx = cand_beams.add(bbsz_offsets) cand_scores = cand_scores[batch_idxs] cand_indices = cand_indices[batch_idxs] if prefix_tokens is not None: prefix_tokens = prefix_tokens[batch_idxs] src_lengths = src_lengths[batch_idxs] scores = scores.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, -1) scores_buf.resize_as_(scores) tokens = tokens.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, -1) tokens_buf.resize_as_(tokens) if attn is not None: attn = attn.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, attn.size(1), -1) attn_buf.resize_as_(attn) bsz = new_bsz else: batch_idxs = None # Set active_mask so that values > cand_size indicate eos or # blacklisted hypos and values < cand_size indicate candidate # active hypos. After this, the min values per row are the top # candidate active hypos. 
active_mask = buffer('active_mask') torch.add( eos_mask.type_as(cand_offsets) * cand_size, cand_offsets[:eos_mask.size(1)], out=active_mask, ) # get the top beam_size active hypotheses, which are just the hypos # with the smallest values in active_mask active_hypos, _ignore = buffer('active_hypos'), buffer('_ignore') torch.topk( active_mask, k=beam_size, dim=1, largest=False, out=(_ignore, active_hypos) ) active_bbsz_idx = buffer('active_bbsz_idx') torch.gather( cand_bbsz_idx, dim=1, index=active_hypos, out=active_bbsz_idx, ) active_scores = torch.gather( cand_scores, dim=1, index=active_hypos, out=scores[:, step].view(bsz, beam_size), ) active_bbsz_idx = active_bbsz_idx.view(-1) active_scores = active_scores.view(-1) # copy tokens and scores for active hypotheses torch.index_select( tokens[:, :step + 1], dim=0, index=active_bbsz_idx, out=tokens_buf[:, :step + 1], ) torch.gather( cand_indices, dim=1, index=active_hypos, out=tokens_buf.view(bsz, beam_size, -1)[:, :, step + 1], ) if step > 0: torch.index_select( scores[:, :step], dim=0, index=active_bbsz_idx, out=scores_buf[:, :step], ) torch.gather( cand_scores, dim=1, index=active_hypos, out=scores_buf.view(bsz, beam_size, -1)[:, :, step], ) # copy attention for active hypotheses if attn is not None: torch.index_select( attn[:, :, :step + 2], dim=0, index=active_bbsz_idx, out=attn_buf[:, :, :step + 2], ) # swap buffers tokens, tokens_buf = tokens_buf, tokens scores, scores_buf = scores_buf, scores if attn is not None: attn, attn_buf = attn_buf, attn # reorder incremental state in decoder reorder_state = active_bbsz_idx # sort by score descending for sent in range(len(finalized)): finalized[sent] = sorted(finalized[sent], key=lambda r: r['score'], reverse=True) return finalized
def ROIAlign(feature_maps, rois, config, pool_size, mode='bilinear'): """Implements ROI Align on the features. Params: - pool_shape: [height, width] of the output pooled regions. Usually [7, 7] - image_shape: [height, width, chanells]. Shape of input image in pixels Inputs: - boxes: [batch, num_boxes, (x1, y1, x2, y2)] in normalized coordinates. Possibly padded with zeros if not enough boxes to fill the array. - Feature maps: List of feature maps from different levels of the pyramid. Each is [batch, channels, height, width] Output: Pooled regions in the shape: [batch, num_boxes, height, width, channels]. The width and height are those specific in the pool_shape in the layer constructor. """ """ [ x2-x1 x1 + x2 - W + 1 ] [ ----- 0 --------------- ] [ W - 1 W - 1 ] [ ] [ y2-y1 y1 + y2 - H + 1 ] [ 0 ----- --------------- ] [ H - 1 H - 1 ] """ #feature_maps= [P2, P3, P4, P5] rois = rois.detach() crop_resize = CropAndResize(pool_size, pool_size, 0) roi_number = rois.size()[1] pooled = rois.data.new( config.IMAGES_PER_GPU*rois.size( 1), 256, pool_size, pool_size).zero_() rois = rois.view( config.IMAGES_PER_GPU*rois.size(1), 4) # Loop through levels and apply ROI pooling to each. P2 to P5. x_1 = rois[:, 0] y_1 = rois[:, 1] x_2 = rois[:, 2] y_2 = rois[:, 3] roi_level = log2_graph( torch.div(torch.sqrt((y_2 - y_1) * (x_2 - x_1)), 224.0)) roi_level = torch.clamp(torch.clamp( torch.add(torch.round(roi_level), 4), min=2), max=5) # P2 is 256x256, P3 is 128x128, P4 is 64x64, P5 is 32x32 # P2 is 4, P3 is 8, P4 is 16, P5 is 32 for i, level in enumerate(range(2, 6)): scaling_ratio = 2**level height = float(config.IMAGE_MAX_DIM)/ scaling_ratio width = float(config.IMAGE_MAX_DIM) / scaling_ratio ixx = torch.eq(roi_level, level) box_indices = ixx.view(-1).int() * 0 ix = torch.unsqueeze(ixx, 1) level_boxes = torch.masked_select(rois, ix) try: if level_boxes.size()[0] == 0: continue except: continue level_boxes = level_boxes.view(-1, 4) crops = crop_resize(feature_maps[i], torch.div( level_boxes, float(config.IMAGE_MAX_DIM) )[:, [1, 0, 3, 2]], box_indices) indices_pooled = ixx.nonzero()[:, 0] pooled[indices_pooled.data, :, :, :] = crops.data pooled = pooled.view(config.IMAGES_PER_GPU, roi_number, 256, pool_size, pool_size) pooled = Variable(pooled).cuda() return pooled
def REINFORCE(training_pairs, policy_nn, optimizer, num_episodes, relation=None): f = open(graphpath) content = f.readlines() f.close() kb = KB() for line in content: ent1, rel, ent2 = line.rsplit() kb.addRelation(ent1, rel, ent2) # Each line is a triple, represented with strings instead of numbers dropout = nn.Dropout(dynamic_action_dropout_rate) train = training_pairs success = 0 path_found = set() path_found_entity = [] path_relation_found = [] success_cnt_list = [] env = Env(dataPath, train[0], model=args.model) # Initialize the environment for i_episode in range(num_episodes): # for i_episode in range(15): start = time.time() print ('Episode %d' % i_episode) sample = train[random.choice(range(len(training_pairs)))] print ('Training sample: ', sample[:-1]) if relation is None: env = Env(dataPath, sample, args.model) else: env.path = [] env.path_relations = [] sample = sample.split() state_idx = [env.entity2id_[sample[0]], env.entity2id_[sample[1]], 0] episode = [] state_batch_negative = [] lstm_input_batch_negative = [] hidden_batch_negative = [] cell_batch_negative = [] action_batch_negative = [] now_embedding_batch_negative = [] neighbour_embeddings_list_batch_negative = [] state_batch_positive = [] lstm_input_batch_positive = [] hidden_batch_positive = [] cell_batch_positive = [] action_batch_positive = [] now_embedding_batch_positive = [] neighbour_embeddings_list_batch_positive = [] hidden_this_time = torch.zeros(3, 1, hidden_dim) cell_this_time = torch.zeros(3, 1, hidden_dim) if USE_CUDA: hidden_this_time = hidden_this_time.cuda() cell_this_time = cell_this_time.cuda() forward_node_list = [] for t in count(): # for t in range(10): state_vec = floatTensor(env.idx_state(state_idx)) state = torch.cat([state_vec, hidden_this_time[-1]], dim=1) # Only use the last layer's output lstm_input = state_vec.unsqueeze(1) now_embedding = floatTensor(env.entity2vec[[state_idx[0]]]) connected_node_list = [] if state_idx[0] in env.entity2link: for rel in env.entity2link[state_idx[0]]: connected_node_list.extend(env.entity2link[state_idx[0]][rel]) connected_node_list = list(set(connected_node_list)) if len(connected_node_list) == 0: neighbour_embeddings_list = [torch.zeros(1, embedding_dim).cuda() if USE_CUDA else torch.zeros(1, embedding_dim)] else: neighbour_embeddings_list = [floatTensor(env.entity2vec[connected_node_list])] action_probs, lstm_output, hidden_new, cell_new = policy_nn(state, lstm_input, hidden_this_time, cell_this_time, now_embedding, neighbour_embeddings_list) # Action Dropout dropout_action_probs = dropout(action_probs) # print(dropout_action_probs.shape) probability = np.squeeze(dropout_action_probs.cpu().detach().numpy()) probability = probability / sum(probability) action_chosen = np.random.choice(np.arange(action_space), p = probability) reward, new_state, done = env.interact(state_idx, action_chosen) if reward == -1: # the action fails for this step state_batch_negative.append(state) lstm_input_batch_negative.append(lstm_input) hidden_batch_negative.append(hidden_this_time) cell_batch_negative.append(cell_this_time) action_batch_negative.append(action_chosen) now_embedding_batch_negative.append(now_embedding) neighbour_embeddings_list_batch_negative.append(neighbour_embeddings_list[0]) # Force to choose a valid action to go forward try: valid_action_list = list(env.entity2link[state_idx[0]].keys()) probability = probability[valid_action_list] # print("Line 288: ", sum(probability)) probability = probability / sum(probability) # print("Line 288: ", probability) 
valid_action_chosen = np.random.choice(valid_action_list, p = probability) valid_reward, valid_new_state, valid_done = env.interact(state_idx, valid_action_chosen) reward, new_state, done = valid_reward, valid_new_state, valid_done if new_state == None: forward_node_list.append(env.entity2id_[sample[1]]) # The right tail entity else: forward_node_list.append(new_state[0]) state_batch_positive.append(state) lstm_input_batch_positive.append(lstm_input) hidden_batch_positive.append(hidden_this_time) cell_batch_positive.append(cell_this_time) action_batch_positive.append(valid_action_chosen) now_embedding_batch_positive.append(now_embedding) neighbour_embeddings_list_batch_positive.append(neighbour_embeddings_list[0]) hidden_this_time = hidden_new cell_this_time = cell_new except: print("Cannot find a valid action!") else: # the action find a path that can forward if new_state == None: forward_node_list.append(env.entity2id_[sample[1]]) # The right tail entity else: forward_node_list.append(new_state[0]) state_batch_positive.append(state) lstm_input_batch_positive.append(lstm_input) hidden_batch_positive.append(hidden_this_time) cell_batch_positive.append(cell_this_time) action_batch_positive.append(action_chosen) now_embedding_batch_positive.append(now_embedding) neighbour_embeddings_list_batch_positive.append(neighbour_embeddings_list[0]) hidden_this_time = hidden_new cell_this_time = cell_new new_state_vec = env.idx_state(new_state) episode.append(Transition(state = state_vec, action = action_chosen, next_state = new_state_vec, reward = reward)) if done or t == max_steps: break state_idx = new_state # Discourage the agent when it chooses an invalid step if len(state_batch_negative) != 0 and done != 1: print ('Penalty to invalid steps:', len(state_batch_negative)) policy_nn.zero_grad() action_mask = byteTensor(convert_to_one_hot(np.array(action_batch_negative), depth = action_space)) # action_prob = torch.stack(action_prob_batch_negative).squeeze(1) # print(state_batch_negative[0].shape) state = torch.cat(state_batch_negative, dim=0) lstm_input = torch.cat(lstm_input_batch_negative, dim=1) hidden = torch.cat(hidden_batch_negative, dim=1) cell = torch.cat(cell_batch_negative, dim=1) now_embedding = torch.cat(now_embedding_batch_negative, dim=0) action_prob, lstm_output, hidden_new, cell_new = policy_nn(state, lstm_input, hidden, cell, now_embedding, neighbour_embeddings_list_batch_negative) # print(action_prob.shape) picked_action_prob = torch.masked_select(action_prob, action_mask) print(picked_action_prob) loss = -torch.sum(torch.log(picked_action_prob) * args.wrong_reward) # Reward for each invalid action is wrong_reward loss.backward(retain_graph=True) torch.nn.utils.clip_grad_norm(policy_nn.parameters(), 0.2) optimizer.step() print ('----- FINAL PATH -----') print ('\t'.join(env.path)) print ('PATH LENGTH', len(env.path)) print ('----- FINAL PATH -----') # If the agent success, do one optimization if done == 1: print ('Success') path_found_entity.append(path_clean(' -> '.join(env.path))) success += 1 # Compute the reward for a successful episode. 
path_length = len(env.path) length_reward = 1/path_length global_reward = 1 if len(path_found) != 0: path_found_embedding = [env.path_embedding(path.split(' -> ')) for path in path_found] curr_path_embedding = env.path_embedding(env.path_relations) path_found_embedding = np.reshape(path_found_embedding, (-1,embedding_dim)) cos_sim = cosine_similarity(path_found_embedding, curr_path_embedding) diverse_reward = -np.mean(cos_sim) print ('diverse_reward', diverse_reward) total_reward = args.global_reward_weight * global_reward + args.length_reward_weight * length_reward + args.diverse_reward_weight * diverse_reward else: total_reward = args.global_reward_weight * global_reward + (args.length_reward_weight + args.diverse_reward_weight) * length_reward path_found.add(' -> '.join(env.path_relations)) # total_reward = 0.1*global_reward + 0.9*length_reward policy_nn.zero_grad() action_mask = byteTensor(convert_to_one_hot(np.array(action_batch_positive), depth = action_space)) state = torch.cat(state_batch_positive, dim=0) lstm_input = torch.cat(lstm_input_batch_positive, dim=1) hidden = torch.cat(hidden_batch_positive, dim=1) cell = torch.cat(cell_batch_positive, dim=1) now_embedding = torch.cat(now_embedding_batch_positive, dim=0) action_prob, lstm_output, hidden_new, cell_new = policy_nn(state, lstm_input, hidden, cell, now_embedding, neighbour_embeddings_list_batch_positive) # print(action_prob.shape) picked_action_prob = torch.masked_select(action_prob, action_mask) loss = -torch.sum(torch.log(picked_action_prob) * total_reward) # The reward for each step of a successful episode is total_reward loss.backward(retain_graph=True) torch.nn.utils.clip_grad_norm(policy_nn.parameters(), 0.2) optimizer.step() else: if (len(state_batch_positive) != 0): # reward shaping if args.reward_shaping_model == "TransH": # print("Enters TransH.") head = ent_embedding[[env.entity2id_[sample[0]]]] rel_emb = rel_embedding[[env.relation2id_[relation.replace('_', ':')]]] norm = norm_embedding[[env.relation2id_[relation.replace('_', ':')]]] tail = ent_embedding[forward_node_list] head_proj = head - np.sum(head * norm, axis=1, keepdims=True) * norm tail_proj = tail - np.sum(tail * norm, axis=1, keepdims=True) * norm scores = -np.sum(np.abs(head_proj + rel_emb - tail_proj), axis = 1) # print(scores) elif args.reward_shaping_model == "TransR": # print("Enters TransR.") head = ent_embedding[[env.entity2id_[sample[0]]]] rel_emb = rel_embedding[[env.relation2id_[relation.replace('_', ':')]]] norm = norm_embedding[[env.relation2id_[relation.replace('_', ':')]]].squeeze(0) tail = ent_embedding[forward_node_list] head_proj = np.matmul(norm, head.T).T tail_proj = np.matmul(norm, tail.T).T scores = -np.sum(np.abs(head_proj + rel_emb - tail_proj), axis = 1) # print(scores) elif args.reward_shaping_model == "TransD": # print("Enters TransD.") head = ent_embedding[[env.entity2id_[sample[0]]]] head_norm = ent_norm_embedding[[env.entity2id_[sample[0]]]] tail = ent_embedding[forward_node_list] tail_norm = ent_norm_embedding[forward_node_list] rel_emb = rel_embedding[[env.relation2id_[relation.replace('_', ':')]]] rel_norm = rel_norm_embedding[[env.relation2id_[relation.replace('_', ':')]]] head_proj = head + np.sum(head * head_norm, axis=1, keepdims=True) * rel_norm tail_proj = tail + np.sum(tail * tail_norm, axis=1, keepdims=True) * rel_norm scores = -np.sum(np.abs(head_proj + rel_emb - tail_proj), axis = 1) # print(scores) elif args.reward_shaping_model == "ProjE": # print("Enter ProjE.") h = ent_embedding[[env.entity2id_[sample[0]]]] 
r = rel_embedding[[env.relation2id_[relation.replace('_', ':')]]] ent_mat = np.transpose(ent_embedding) hr = h * simple_hr_combination_weights[:100] + r * simple_hr_combination_weights[100:] hrt_res = np.matmul(np.tanh(hr + combination_bias_hr), ent_mat) scores = hrt_res[0][forward_node_list] scores = torch.log(torch.sigmoid(torch.FloatTensor(scores))).numpy() # print(scores) elif args.reward_shaping_model == "ConvE": # print("Enters ConvE.") rel_id = TransE_to_ConvE_id_relation[env.relation2id_[relation.replace('_', ':')]] head_id = TransE_to_ConvE_id_entity[env.entity2id_[sample[0]]] tail_id = [TransE_to_ConvE_id_entity[elem] for elem in forward_node_list] bs = ConvE_model.batch_size x_middle, output = ConvE_model(longTensor([head_id] + [0] * (bs - 1)), longTensor([rel_id] * bs)) scores = np.log(output[0][tail_id].detach().cpu().numpy() + 10 ** -30) # print(scores) else: head_embedding = ent_embedding[env.entity2id_[sample[0]]] query_embedding = rel_embedding[env.relation2id_[relation.replace('_', ':')]] tail_embedding = ent_embedding[forward_node_list] scores = -np.sum(np.abs(head_embedding + query_embedding - tail_embedding), axis = 1) policy_nn.zero_grad() action_mask = byteTensor(convert_to_one_hot(np.array(action_batch_positive), depth = action_space)) state = torch.cat(state_batch_positive, dim=0) lstm_input = torch.cat(lstm_input_batch_positive, dim=1) hidden = torch.cat(hidden_batch_positive, dim=1) cell = torch.cat(cell_batch_positive, dim=1) now_embedding = torch.cat(now_embedding_batch_positive, dim=0) action_prob, lstm_output, hidden_new, cell_new = policy_nn(state, lstm_input, hidden, cell, now_embedding, neighbour_embeddings_list_batch_positive) # print(action_prob.shape) picked_action_prob = torch.masked_select(action_prob, action_mask) # print(picked_action_prob) loss = -torch.sum(torch.log(picked_action_prob) * floatTensor(scores) * args.useless_reward) # The reward for each step of an unsuccessful episode is useless_reward loss.backward(retain_graph=True) torch.nn.utils.clip_grad_norm(policy_nn.parameters(), 0.2) optimizer.step() print ('Failed, Do one teacher guideline') # Force the agent to learn using a successful sample teacher_success_flag = False teacher_success_failed_times = 0 while (not teacher_success_flag) and teacher_success_failed_times < 3: try: good_episodes = teacher(sample[0], sample[1], 1, env, graphpath, knowledge_base = kb, output_mode = 1) # Episode's ID instead of state! 
if len(good_episodes) == 0: teacher_success_failed_times += 1 else: for item in good_episodes: if len(item) == 0: teacher_success_failed_times += 1 break teacher_state_batch = [] teacher_action_batch = [] teacher_now_embedding_batch = [] teacher_neighbour_embeddings_list_batch = [] total_reward = 0.0*1 + 1*1/len(item) for t, transition in enumerate(item): teacher_state_batch.append(floatTensor(env.idx_state(transition.state))) teacher_action_batch.append(transition.action) teacher_now_embedding_batch.append(floatTensor(env.entity2vec[[transition.state[0]]])) connected_node_list = [] if transition.state[0] in env.entity2link: for rel in env.entity2link[transition.state[0]]: connected_node_list.extend(env.entity2link[transition.state[0]][rel]) connected_node_list = list(set(connected_node_list)) # Remove duplicates if len(connected_node_list) == 0: if USE_CUDA: neighbour_embeddings_list = torch.zeros(1, embedding_dim).cuda() else: neighbour_embeddings_list = torch.zeros(1, embedding_dim) else: neighbour_embeddings_list = floatTensor(env.entity2vec[connected_node_list]) teacher_neighbour_embeddings_list_batch.append(neighbour_embeddings_list) if (len(teacher_state_batch) != 0): hidden_this_time = torch.zeros(3, 1, hidden_dim) cell_this_time = torch.zeros(3, 1, hidden_dim) if USE_CUDA: hidden_this_time = hidden_this_time.cuda() cell_this_time = cell_this_time.cuda() state_batch_teacher = [] lstm_input_batch_teacher = [] hidden_batch_teacher = [] cell_batch_teacher = [] for idx, state_vec in enumerate(teacher_state_batch): state_vec = floatTensor(state_vec) state = torch.cat([state_vec, hidden_this_time[-1]], dim=1) # Only use the last layer's output lstm_input = state_vec.unsqueeze(1) now_embedding = teacher_now_embedding_batch[idx] teacher_neighbour_embeddings_list = [teacher_neighbour_embeddings_list_batch[idx]] action_prob, lstm_output, hidden_new, cell_new = policy_nn(state, lstm_input, hidden_this_time, cell_this_time, now_embedding, teacher_neighbour_embeddings_list) # print(action_prob.shape) hidden_this_time = hidden_new cell_this_time = cell_new state_batch_teacher.append(state) lstm_input_batch_teacher.append(lstm_input) hidden_batch_teacher.append(hidden_this_time) cell_batch_teacher.append(cell_this_time) now_embedding = torch.cat(teacher_now_embedding_batch, dim=0) policy_nn.zero_grad() action_mask = byteTensor(convert_to_one_hot(np.array(teacher_action_batch), depth = action_space)) state = torch.cat(state_batch_teacher, dim=0) lstm_input = torch.cat(lstm_input_batch_teacher, dim=1) hidden = torch.cat(hidden_batch_teacher, dim=1) cell = torch.cat(cell_batch_teacher, dim=1) action_prob, lstm_output, hidden_new, cell_new = policy_nn(state, lstm_input, hidden, cell, now_embedding, teacher_neighbour_embeddings_list_batch) # print(action_prob.shape) picked_action_prob = torch.masked_select(action_prob, action_mask) loss = -torch.sum(torch.log(picked_action_prob) * args.teacher_reward) # The reward for each step of a teacher episode is teacher_reward loss.backward(retain_graph=True) torch.nn.utils.clip_grad_norm(policy_nn.parameters(), 0.2) optimizer.step() teacher_success_flag = True else: teacher_success_failed_times += 1 except Exception as e: print ('Teacher guideline failed') teacher_success_failed_times += 10 print ('Episode time: ', time.time() - start) print ('\n') print ("Retrain Success count: ", success) success_cnt_list.append(success) print ('Retrain Success percentage:', success/num_episodes) print (success_cnt_list) for path in path_found_entity: # Only successful paths 
rel_ent = path.split(' -> ') path_relation = [] for idx, item in enumerate(rel_ent): if idx%2 == 0: path_relation.append(item) path_relation_found.append(' -> '.join(path_relation)) relation_path_stats = collections.Counter(path_relation_found).items() relation_path_stats = sorted(relation_path_stats, key = lambda x:x[1], reverse=True) # Rank the paths according to their frequency. f = open(feature_stats, 'w') for item in relation_path_stats: f.write(item[0]+'\t'+str(item[1])+'\n') f.close() print ('Path stats saved') with open("logs/training/" + relation + ".out", 'a') as fw: fw.write(save_file_header + '_path_stats.txt' + '\n') fw.write('Retrain Success percentage: ' + str(success/num_episodes) + '\n') fw.write("Retrain success cnt list: ") fw.write(" ".join([str(elem) for elem in success_cnt_list]) + '\n') fw.write("\n") return
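The policy update in REINFORCE above repeats one pattern in the negative, positive, and teacher branches: build a one-hot mask over the action space from the chosen actions, pull out the matching probabilities with masked_select, and weight their log by a reward. A minimal, self-contained sketch of that step; the convert_to_one_hot helper and the numbers are stand-ins, not the project's own code.

import numpy as np
import torch

def convert_to_one_hot(actions, depth):
    # stand-in helper: one True per row at the chosen action index
    one_hot = np.zeros((len(actions), depth), dtype=bool)
    one_hot[np.arange(len(actions)), actions] = True
    return one_hot

action_space = 5
action_prob = torch.softmax(torch.randn(3, action_space), dim=1)     # policy output for 3 steps
actions = np.array([1, 4, 0])                                        # actions taken on the path

action_mask = torch.from_numpy(convert_to_one_hot(actions, depth=action_space))
picked_action_prob = torch.masked_select(action_prob, action_mask)   # probability of each chosen action
total_reward = 0.8
loss = -torch.sum(torch.log(picked_action_prob) * total_reward)      # REINFORCE-style loss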
def _generate( self, sample: Dict[str, Dict[str, Tensor]], prefix_tokens: Optional[Tensor] = None, bos_token: Optional[int] = None, ): net_input = sample["net_input"] src_tokens = net_input["src_tokens"] if src_tokens.dim() > 2: src_lengths = net_input["src_lengths"] else: # length of the source text being the character length except EndOfSentence and pad src_lengths = ((src_tokens.ne(self.eos) & src_tokens.ne(self.pad)).long().sum(dim=1)) # bsz: total number of sentences in beam input_size = src_tokens.size() bsz, src_len = input_size[0], input_size[1] beam_size = self.beam_size max_len: int = -1 if self.match_source_len: max_len = src_lengths.max().item() else: max_len = min( int(self.max_len_a * src_len + self.max_len_b), # exclude the EOS marker self.model.max_decoder_positions() - 1, ) assert ( self.min_len <= max_len ), "min_len cannot be larger than max_len, please adjust these!" # compute the encoder output for each beam encoder_outs = self.model.forward_encoder(net_input) # placeholder of indices for bsz * beam_size to hold tokens and accumulative scores new_order = torch.arange(bsz).view(-1, 1).repeat(1, beam_size).view(-1) new_order = new_order.to(src_tokens.device).long() encoder_outs = self.model.reorder_encoder_out(encoder_outs, new_order) # ensure encoder_outs is a List. assert encoder_outs is not None # initialize buffers scores = (torch.zeros(bsz * beam_size, max_len + 1).to(src_tokens).float() ) # +1 for eos; pad is never choosed for scoring tokens = (torch.zeros(bsz * beam_size, max_len + 2).to(src_tokens).long().fill_( self.pad)) # +2 for eos and pad tokens[:, 0] = self.eos if bos_token is None else bos_token attn: Optional[Tensor] = None # The blacklist indicates candidates that should be ignored. # For example, suppose we're sampling and have already finalized 2/5 # samples. Then the blacklist would mark 2 positions as being ignored, # so that we only finalize the remaining 3 samples. 
blacklist = (torch.zeros(bsz, beam_size).to(src_tokens).eq(-1) ) # forward and backward-compatible False mask # list of completed sentences finalized = torch.jit.annotate( List[List[Dict[str, Tensor]]], [ torch.jit.annotate(List[Dict[str, Tensor]], []) for i in range(bsz) ], ) # contains lists of dictionaries of infomation about the hypothesis being finalized at each step finished = [ False for i in range(bsz) ] # a boolean array indicating if the sentence at the index is finished or not num_remaining_sent = bsz # number of sentences remaining # number of candidate hypos per step cand_size = 2 * beam_size # 2 x beam size in case half are EOS # offset arrays for converting between different indexing schemes bbsz_offsets = (torch.arange(0, bsz) * beam_size).unsqueeze(1).type_as(tokens) cand_offsets = torch.arange(0, cand_size).type_as(tokens) reorder_state: Optional[Tensor] = None batch_idxs: Optional[Tensor] = None for step in range(max_len + 1): # one extra step for EOS marker # reorder decoder internal states based on the prev choice of beams # print(f'step: {step}') if reorder_state is not None: if batch_idxs is not None: # update beam indices to take into account removed sentences corr = batch_idxs - torch.arange( batch_idxs.numel()).type_as(batch_idxs) reorder_state.view(-1, beam_size).add_( corr.unsqueeze(-1) * beam_size) self.model.reorder_incremental_state(reorder_state) encoder_outs = self.model.reorder_encoder_out( encoder_outs, reorder_state) lprobs, avg_attn_scores = self.model.forward_decoder( tokens[:, :step + 1], encoder_outs, self.temperature) lprobs[lprobs != lprobs] = torch.tensor(-math.inf).to(lprobs) lprobs[:, self.pad] = -math.inf # never select pad lprobs[:, self.unk] -= self.unk_penalty # apply unk penalty # handle max length constraint if step >= max_len: lprobs[:, :self.eos] = -math.inf lprobs[:, self.eos + 1:] = -math.inf elif self.eos_factor is not None: # only consider EOS if its score is no less than a specified # factor of the best candidate score disallow_eos_mask = lprobs[:, self. 
eos] < self.eos_factor * lprobs.max( dim=1)[0] lprobs[disallow_eos_mask, self.eos] = -math.inf # handle prefix tokens (possibly with different lengths) if (prefix_tokens is not None and step < prefix_tokens.size(1) and step < max_len): lprobs, tokens, scores = self._prefix_tokens( step, lprobs, scores, tokens, prefix_tokens, beam_size) elif step < self.min_len: # minimum length constraint (does not apply if using prefix_tokens) lprobs[:, self.eos] = -math.inf # Record attention scores, only support avg_attn_scores is a Tensor if avg_attn_scores is not None: if attn is None: attn = torch.empty(bsz * beam_size, avg_attn_scores.size(1), max_len + 2).to(scores) attn[:, :, step + 1].copy_(avg_attn_scores) scores = scores.type_as(lprobs) eos_bbsz_idx = torch.empty(0).to( tokens ) # indices of hypothesis ending with eos (finished sentences) eos_scores = torch.empty(0).to( scores ) # scores of hypothesis ending with eos (finished sentences) self.search.set_src_lengths(src_lengths) if self.no_repeat_ngram_size > 0: lprobs = self._no_repeat_ngram(tokens, lprobs, bsz, beam_size, step) cand_scores, cand_indices, cand_beams = self.search.step( step, lprobs.view(bsz, -1, self.vocab_size), scores.view(bsz, beam_size, -1)[:, :, :step], ) # cand_bbsz_idx contains beam indices for the top candidate # hypotheses, with a range of values: [0, bsz*beam_size), # and dimensions: [bsz, cand_size] cand_bbsz_idx = cand_beams.add(bbsz_offsets) # finalize hypotheses that end in eos eos_mask = cand_indices.eq(self.eos) & cand_scores.ne(-math.inf) eos_mask[:, :beam_size][blacklist] = torch.tensor(0).to(eos_mask) # only consider eos when it's among the top beam_size indices eos_bbsz_idx = torch.masked_select(cand_bbsz_idx[:, :beam_size], mask=eos_mask[:, :beam_size]) finalized_sents: List[int] = [] if eos_bbsz_idx.numel() > 0: eos_scores = torch.masked_select(cand_scores[:, :beam_size], mask=eos_mask[:, :beam_size]) finalized_sents = self.finalize_hypos( step, eos_bbsz_idx, eos_scores, tokens, scores, finalized, finished, beam_size, attn, src_lengths, max_len, ) num_remaining_sent -= len(finalized_sents) assert num_remaining_sent >= 0 if num_remaining_sent == 0: break assert step < max_len if len(finalized_sents) > 0: new_bsz = bsz - len(finalized_sents) # construct batch_idxs which holds indices of batches to keep for the next pass batch_mask = torch.ones(bsz).to(cand_indices) batch_mask[torch.tensor(finalized_sents).to( cand_indices)] = torch.tensor(0).to(batch_mask) batch_idxs = batch_mask.nonzero().squeeze(-1) eos_mask = eos_mask[batch_idxs] cand_beams = cand_beams[batch_idxs] bbsz_offsets.resize_(new_bsz, 1) cand_bbsz_idx = cand_beams.add(bbsz_offsets) cand_scores = cand_scores[batch_idxs] cand_indices = cand_indices[batch_idxs] if prefix_tokens is not None: prefix_tokens = prefix_tokens[batch_idxs] src_lengths = src_lengths[batch_idxs] blacklist = blacklist[batch_idxs] scores = scores.view(bsz, -1)[batch_idxs].view( new_bsz * beam_size, -1) tokens = tokens.view(bsz, -1)[batch_idxs].view( new_bsz * beam_size, -1) if attn is not None: attn = attn.view(bsz, -1)[batch_idxs].view( new_bsz * beam_size, attn.size(1), -1) bsz = new_bsz else: batch_idxs = None # set active_mask so that values > cand_size indicate eos hypos # and values < cand_size indicate candidate active hypos. # After, the min values per row are the top candidate active hypos # Rewrite the operator since the element wise or is not supported in torchscript. 
eos_mask[:, :beam_size] = ~((~blacklist) & (~eos_mask[:, :beam_size])) active_mask = torch.add( eos_mask.type_as(cand_offsets) * cand_size, cand_offsets[:eos_mask.size(1)], ) # get the top beam_size active hypotheses, which are just the hypos # with the smallest values in active_mask new_blacklist, active_hypos = torch.topk(active_mask, k=beam_size, dim=1, largest=False) # update blacklist to ignore any finalized hypos blacklist = new_blacklist.ge(cand_size)[:, :beam_size] assert (~blacklist).any(dim=1).all() active_bbsz_idx = torch.gather(cand_bbsz_idx, dim=1, index=active_hypos) active_scores = torch.gather(cand_scores, dim=1, index=active_hypos) active_bbsz_idx = active_bbsz_idx.view(-1) active_scores = active_scores.view(-1) # copy tokens and scores for active hypotheses tokens[:, :step + 1] = torch.index_select(tokens[:, :step + 1], dim=0, index=active_bbsz_idx) tokens.view(bsz, beam_size, -1)[:, :, step + 1] = torch.gather(cand_indices, dim=1, index=active_hypos) if step > 0: scores[:, :step] = torch.index_select(scores[:, :step], dim=0, index=active_bbsz_idx) scores.view(bsz, beam_size, -1)[:, :, step] = torch.gather(cand_scores, dim=1, index=active_hypos) # copy attention for active hypotheses if attn is not None: attn[:, :, :step + 2] = torch.index_select( attn[:, :, :step + 2], dim=0, index=active_bbsz_idx) # reorder incremental state in decoder reorder_state = active_bbsz_idx # sort by score descending for sent in range(len(finalized)): # make into beam container BCList = [ BeamContainer(elem["score"].item(), elem) for elem in finalized[sent] ] BCList.sort() BCList.reverse() finalized[sent] = torch.jit.annotate(List[Dict[str, Tensor]], [x.elem for x in BCList]) return finalized
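Each of the beam-search _generate implementations in this file finalizes hypotheses with the same masked_select move: candidates whose token is EOS, restricted to the top beam_size slots, are gathered out of the flat beam-index and score tensors. A reduced sketch with toy shapes; beam_size, cand_size, and the EOS id here are invented for illustration.

import math
import torch

bsz, beam_size = 3, 2
cand_size = 2 * beam_size
eos = 2

cand_indices = torch.randint(0, 10, (bsz, cand_size))                 # candidate token ids
cand_scores = torch.randn(bsz, cand_size)                             # cumulative scores
cand_beams = torch.randint(0, beam_size, (bsz, cand_size))
bbsz_offsets = (torch.arange(0, bsz) * beam_size).unsqueeze(1)
cand_bbsz_idx = cand_beams + bbsz_offsets                             # flat indices in [0, bsz*beam_size)

eos_mask = cand_indices.eq(eos) & cand_scores.ne(-math.inf)
# only candidates in the top beam_size slots may finish a hypothesis
eos_bbsz_idx = torch.masked_select(cand_bbsz_idx[:, :beam_size], mask=eos_mask[:, :beam_size])
eos_scores = torch.masked_select(cand_scores[:, :beam_size], mask=eos_mask[:, :beam_size])
print(eos_bbsz_idx, eos_scores)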
def _generate(self, src_tokens, src_lengths, beam_size=None, maxlen=None, prefix_tokens=None): bsz, srclen = src_tokens.size() maxlen = min(maxlen, self.maxlen) if maxlen is not None else self.maxlen # the max beam size is the dictionary size - 1, since we never select pad beam_size = beam_size if beam_size is not None else self.beam_size beam_size = min(beam_size, self.vocab_size - 1) encoder_outs = [] incremental_states = {} for model in self.models: if not self.retain_dropout: model.eval() if isinstance(model.decoder, FairseqIncrementalDecoder): incremental_states[model] = {} else: incremental_states[model] = None # compute the encoder output for each beam encoder_out = model.encoder( src_tokens.repeat(1, beam_size).view(-1, srclen), src_lengths.expand(beam_size, src_lengths.numel()).t().contiguous().view(-1), ) encoder_outs.append(encoder_out) # initialize buffers scores = src_tokens.data.new(bsz * beam_size, maxlen + 1).float().fill_(0) scores_buf = scores.clone() tokens = src_tokens.data.new(bsz * beam_size, maxlen + 2).fill_(self.pad) tokens_buf = tokens.clone() tokens[:, 0] = self.eos attn, attn_buf = None, None nonpad_idxs = None # list of completed sentences finalized = [[] for i in range(bsz)] finished = [False for i in range(bsz)] worst_finalized = [{'idx': None, 'score': -math.inf} for i in range(bsz)] num_remaining_sent = bsz # number of candidate hypos per step cand_size = 2 * beam_size # 2 x beam size in case half are EOS # offset arrays for converting between different indexing schemes bbsz_offsets = (torch.arange(0, bsz) * beam_size).unsqueeze(1).type_as(tokens) cand_offsets = torch.arange(0, cand_size).type_as(tokens) # helper function for allocating buffers on the fly buffers = {} def buffer(name, type_of=tokens): # noqa if name not in buffers: buffers[name] = type_of.new() return buffers[name] def is_finished(sent, step, unfinalized_scores=None): """ Check whether we've finished generation for a given sentence, by comparing the worst score among finalized hypotheses to the best possible score among unfinalized hypotheses. """ assert len(finalized[sent]) <= beam_size if len(finalized[sent]) == beam_size: if self.stop_early or step == maxlen or unfinalized_scores is None: return True # stop if the best unfinalized score is worse than the worst # finalized one best_unfinalized_score = unfinalized_scores[sent].max() if self.normalize_scores: best_unfinalized_score /= maxlen ** self.len_penalty if worst_finalized[sent]['score'] >= best_unfinalized_score: return True return False def finalize_hypos(step, bbsz_idx, eos_scores, unfinalized_scores=None): """ Finalize the given hypotheses at this step, while keeping the total number of finalized hypotheses per sentence <= beam_size. Note: the input must be in the desired finalization order, so that hypotheses that appear earlier in the input are preferred to those that appear later. 
Args: step: current time step bbsz_idx: A vector of indices in the range [0, bsz*beam_size), indicating which hypotheses to finalize eos_scores: A vector of the same size as bbsz_idx containing scores for each hypothesis unfinalized_scores: A vector containing scores for all unfinalized hypotheses """ assert bbsz_idx.numel() == eos_scores.numel() # clone relevant token and attention tensors tokens_clone = tokens.index_select(0, bbsz_idx) tokens_clone = tokens_clone[:, 1:step + 2] # skip the first index, which is EOS tokens_clone[:, step] = self.eos attn_clone = attn.index_select(0, bbsz_idx)[:, :, 1:step+2] if attn is not None else None # compute scores per token position pos_scores = scores.index_select(0, bbsz_idx)[:, :step+1] pos_scores[:, step] = eos_scores # convert from cumulative to per-position scores pos_scores[:, 1:] = pos_scores[:, 1:] - pos_scores[:, :-1] # normalize sentence-level scores if self.normalize_scores: eos_scores /= (step + 1) ** self.len_penalty cum_unfin = [] prev = 0 for f in finished: if f: prev += 1 else: cum_unfin.append(prev) sents_seen = set() for i, (idx, score) in enumerate(zip(bbsz_idx.tolist(), eos_scores.tolist())): unfin_idx = idx // beam_size sent = unfin_idx + cum_unfin[unfin_idx] sents_seen.add((sent, unfin_idx)) def get_hypo(): if attn_clone is not None: # remove padding tokens from attn scores hypo_attn = attn_clone[i][nonpad_idxs[sent]] _, alignment = hypo_attn.max(dim=0) else: hypo_attn = None alignment = None return { 'tokens': tokens_clone[i], 'score': score, 'attention': hypo_attn, # src_len x tgt_len 'alignment': alignment, 'positional_scores': pos_scores[i], } if len(finalized[sent]) < beam_size: finalized[sent].append(get_hypo()) elif not self.stop_early and score > worst_finalized[sent]['score']: # replace worst hypo for this sentence with new/better one worst_idx = worst_finalized[sent]['idx'] if worst_idx is not None: finalized[sent][worst_idx] = get_hypo() # find new worst finalized hypo for this sentence idx, s = min(enumerate(finalized[sent]), key=lambda r: r[1]['score']) worst_finalized[sent] = { 'score': s['score'], 'idx': idx, } newly_finished = [] for sent, unfin_idx in sents_seen: # check termination conditions for this sentence if not finished[sent] and is_finished(sent, step, unfinalized_scores): finished[sent] = True newly_finished.append(unfin_idx) return newly_finished reorder_state = None batch_idxs = None for step in range(maxlen + 1): # one extra step for EOS marker # reorder decoder internal states based on the prev choice of beams if reorder_state is not None: if batch_idxs is not None: # update beam indices to take into account removed sentences corr = batch_idxs - torch.arange(batch_idxs.numel()).type_as(batch_idxs) reorder_state.view(-1, beam_size).add_(corr.unsqueeze(-1) * beam_size) for i, model in enumerate(self.models): if isinstance(model.decoder, FairseqIncrementalDecoder): model.decoder.reorder_incremental_state(incremental_states[model], reorder_state) encoder_outs[i] = model.encoder.reorder_encoder_out(encoder_outs[i], reorder_state) lprobs, avg_attn_scores = self._decode(tokens[:, :step + 1], encoder_outs, incremental_states) lprobs[:, self.pad] = -math.inf # never select pad lprobs[:, self.unk] -= self.unk_penalty # apply unk penalty # Record attention scores if avg_attn_scores is not None: if attn is None: attn = scores.new(bsz * beam_size, src_tokens.size(1), maxlen + 2) attn_buf = attn.clone() nonpad_idxs = src_tokens.ne(self.pad) attn[:, :, step + 1].copy_(avg_attn_scores) scores = 
scores.type_as(lprobs) scores_buf = scores_buf.type_as(lprobs) eos_bbsz_idx = buffer('eos_bbsz_idx') eos_scores = buffer('eos_scores', type_of=scores) if step < maxlen: if prefix_tokens is not None and step < prefix_tokens.size(1): probs_slice = lprobs.view(bsz, -1, lprobs.size(-1))[:, 0, :] cand_scores = torch.gather( probs_slice, dim=1, index=prefix_tokens[:, step].view(-1, 1).data ).expand(-1, cand_size) cand_indices = prefix_tokens[:, step].view(-1, 1).expand(bsz, cand_size).data cand_beams = torch.zeros_like(cand_indices) else: cand_scores, cand_indices, cand_beams = self.search.step( step, lprobs.view(bsz, -1, self.vocab_size), scores.view(bsz, beam_size, -1)[:, :, :step], ) else: # make probs contain cumulative scores for each hypothesis lprobs.add_(scores[:, step - 1].unsqueeze(-1)) # finalize all active hypotheses once we hit maxlen # pick the hypothesis with the highest prob of EOS right now torch.sort( lprobs[:, self.eos], descending=True, out=(eos_scores, eos_bbsz_idx), ) num_remaining_sent -= len(finalize_hypos( step, eos_bbsz_idx, eos_scores)) assert num_remaining_sent == 0 break # cand_bbsz_idx contains beam indices for the top candidate # hypotheses, with a range of values: [0, bsz*beam_size), # and dimensions: [bsz, cand_size] cand_bbsz_idx = cand_beams.add(bbsz_offsets) # finalize hypotheses that end in eos eos_mask = cand_indices.eq(self.eos) finalized_sents = set() if step >= self.minlen: # only consider eos when it's among the top beam_size indices torch.masked_select( cand_bbsz_idx[:, :beam_size], mask=eos_mask[:, :beam_size], out=eos_bbsz_idx, ) if eos_bbsz_idx.numel() > 0: torch.masked_select( cand_scores[:, :beam_size], mask=eos_mask[:, :beam_size], out=eos_scores, ) finalized_sents = finalize_hypos( step, eos_bbsz_idx, eos_scores, cand_scores) num_remaining_sent -= len(finalized_sents) assert num_remaining_sent >= 0 if num_remaining_sent == 0: break assert step < maxlen if len(finalized_sents) > 0: new_bsz = bsz - len(finalized_sents) # construct batch_idxs which holds indices of batches to keep for the next pass batch_mask = cand_indices.new_ones(bsz) batch_mask[cand_indices.new(finalized_sents)] = 0 batch_idxs = batch_mask.nonzero().squeeze(-1) eos_mask = eos_mask[batch_idxs] cand_beams = cand_beams[batch_idxs] bbsz_offsets.resize_(new_bsz, 1) cand_bbsz_idx = cand_beams.add(bbsz_offsets) cand_scores = cand_scores[batch_idxs] cand_indices = cand_indices[batch_idxs] if prefix_tokens is not None: prefix_tokens = prefix_tokens[batch_idxs] scores = scores.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, -1) scores_buf.resize_as_(scores) tokens = tokens.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, -1) tokens_buf.resize_as_(tokens) if attn is not None: attn = attn.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, attn.size(1), -1) attn_buf.resize_as_(attn) bsz = new_bsz else: batch_idxs = None # set active_mask so that values > cand_size indicate eos hypos # and values < cand_size indicate candidate active hypos. 
# After, the min values per row are the top candidate active hypos active_mask = buffer('active_mask') torch.add( eos_mask.type_as(cand_offsets) * cand_size, cand_offsets[:eos_mask.size(1)], out=active_mask, ) # get the top beam_size active hypotheses, which are just the hypos # with the smallest values in active_mask active_hypos, _ignore = buffer('active_hypos'), buffer('_ignore') torch.topk( active_mask, k=beam_size, dim=1, largest=False, out=(_ignore, active_hypos) ) active_bbsz_idx = buffer('active_bbsz_idx') torch.gather( cand_bbsz_idx, dim=1, index=active_hypos, out=active_bbsz_idx, ) active_scores = torch.gather( cand_scores, dim=1, index=active_hypos, out=scores[:, step].view(bsz, beam_size), ) active_bbsz_idx = active_bbsz_idx.view(-1) active_scores = active_scores.view(-1) # copy tokens and scores for active hypotheses torch.index_select( tokens[:, :step + 1], dim=0, index=active_bbsz_idx, out=tokens_buf[:, :step + 1], ) torch.gather( cand_indices, dim=1, index=active_hypos, out=tokens_buf.view(bsz, beam_size, -1)[:, :, step + 1], ) if step > 0: torch.index_select( scores[:, :step], dim=0, index=active_bbsz_idx, out=scores_buf[:, :step], ) torch.gather( cand_scores, dim=1, index=active_hypos, out=scores_buf.view(bsz, beam_size, -1)[:, :, step], ) # copy attention for active hypotheses if attn is not None: torch.index_select( attn[:, :, :step + 2], dim=0, index=active_bbsz_idx, out=attn_buf[:, :, :step + 2], ) # swap buffers tokens, tokens_buf = tokens_buf, tokens scores, scores_buf = scores_buf, scores if attn is not None: attn, attn_buf = attn_buf, attn # reorder incremental state in decoder reorder_state = active_bbsz_idx # sort by score descending for sent in range(len(finalized)): finalized[sent] = sorted(finalized[sent], key=lambda r: r['score'], reverse=True) return finalized
Apply an [N, 1] mask to data of shape [N, 4]; the result is a flattened tensor of shape [M].
The comparison ops in torch: gt (greater than), lt (less than), ge (greater than or equal to),
le (less than or equal to), eq (equal).
"""
seed = torch.manual_seed(0)
cls_label = torch.randint(0, 3, (10, 1))
print(cls_label.shape)
offset_label = torch.randn(10, 4)
print(offset_label.shape)

'Method 1: torch.masked_select'
mask_cls = torch.lt(cls_label, 2)  # the mask is 2-D ([N, 1]); plain indexing would need the extra dimension dropped first
print(mask_cls.shape)  # torch.Size([10, 1])
# exit()
cls = torch.masked_select(cls_label, mask_cls)
print(cls, cls.shape)  # tensor([0, 0, 1, 0, 1, 1, 1, 0]) torch.Size([8])
mask_offset = torch.gt(cls_label, 0)
print(mask_offset.shape)  # torch.Size([10, 1])
offset = torch.masked_select(offset_label, mask_offset)
print(offset, offset.shape)  # element order is preserved, so reshaping back to [-1, 4] matches Method 2 below
print(offset.reshape(-1, 4), offset.reshape(-1, 4).shape)
print("-----------------------------")

'Method 2: conventional boolean indexing'
mask_cls = cls_label[:, 0] < 2
cls = cls_label[mask_cls]
print(cls.shape)
mask_offset = cls_label[:, 0] > 0
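A compact check of the equivalence the walkthrough above makes: masked_select flattens its output, but element order is preserved, so reshaping back to [-1, 4] matches plain boolean row indexing. On recent PyTorch the [N, 1] mask broadcasts inside masked_select; the seed and shapes mirror the snippet.

import torch

torch.manual_seed(0)
cls_label = torch.randint(0, 3, (10, 1))
offset_label = torch.randn(10, 4)

offset_a = torch.masked_select(offset_label, cls_label.gt(0)).reshape(-1, 4)  # flatten, then restore rows
offset_b = offset_label[cls_label[:, 0] > 0]                                  # boolean indexing keeps rows
print(torch.equal(offset_a, offset_b))                                        # True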
def updateOutput(self, input):
    input, mask = input
    torch.masked_select(input, mask, out=self.output)
    return self.output
def apply_mask(inp, mask, size=9): return torch.masked_select(inp.transpose(0,-1),mask).view(size,-1).transpose(0,1)
def test_net(save_folder, net, cuda, dataset, transform, top_k, im_size=300, thresh=0.05): """Test a Fast R-CNN network on an image database.""" num_images = len(dataset) # all detections are collected into: # all_boxes[cls][image] = N x 5 array of detections in # (x1, y1, x2, y2, score) """ all_boxes = [[[] for _ in range(num_images)] for _ in range(len(labelmap)+1)] """ all_boxes = [[[] for _ in range(num_images)] for _ in range(len(labelmap))] # timers _t = {'im_detect': Timer(), 'misc': Timer()} output_dir = get_output_dir('ssd300_120000', set_type) det_file = os.path.join(output_dir, 'detections.pkl') for i in range(num_images): im, gt, h, w = dataset.pull_item(i) x = Variable(im.unsqueeze(0)) """ if args.cuda: x = x.cuda() """ _t['im_detect'].tic() detections = net(x).data detect_time = _t['im_detect'].toc(average=False) # skip j = 0, because it's the background class for j in range(1, detections.size(1)): dets = detections[0, j, :] mask = dets[:, 0].gt(0.).expand(5, dets.size(0)).t() dets = torch.masked_select(dets, mask).view(-1, 5) if dets.dim() == 0: continue boxes = dets[:, 1:] boxes[:, 0] *= w boxes[:, 2] *= w boxes[:, 1] *= h boxes[:, 3] *= h scores = dets[:, 0].cpu().numpy() cls_dets = np.hstack((boxes.cpu().numpy(), scores[:, np.newaxis])) \ .astype(np.float32, copy=False) all_boxes[j][i] = cls_dets print('im_detect: {:d}/{:d} {:.3f}s'.format(i + 1, num_images, detect_time)) with open(det_file, 'wb') as f: pickle.dump(all_boxes, f, pickle.HIGHEST_PROTOCOL) print('Evaluating detections') evaluate_detections(all_boxes, output_dir, dataset)
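The per-class filtering step in test_net above is another broadcast-and-reshape use of masked_select: a score threshold produces a row mask, the mask is expanded over the 5 detection fields, and the flat selection is viewed back as [M, 5]. A small stand-alone sketch with made-up detections:

import torch

dets = torch.tensor([[0.9, 0.10, 0.10, 0.50, 0.50],
                     [0.0, 0.20, 0.20, 0.60, 0.60],
                     [0.7, 0.30, 0.30, 0.80, 0.80]])   # [score, x1, y1, x2, y2]

keep = dets[:, 0].gt(0.)                               # rows with a positive score
mask = keep.unsqueeze(1).expand_as(dets)               # same effect as .expand(5, N).t() above
kept = torch.masked_select(dets, mask).view(-1, 5)
print(kept.shape)                                      # torch.Size([2, 5])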
if __name__ == '__main__':
    opts = {
        'dim_mm': 6,
        'dim_ho': 4,
    }
    nms_module = Dumplicate_Removal(opts)
    visual_features = Variable(torch.normal(torch.zeros(10, 4)))
    rois = Variable(
        torch.cat((torch.zeros(10, 1),
                   (torch.rand(10, 4) + torch.FloatTensor([[0, 1, 2, 3], ])) * 100),
                  dim=1))
    duplicate_labels = Variable(torch.ones(5, 1)).type(torch.LongTensor)
    cls_prob_object = Variable(torch.rand(10, 20))
    mask = torch.zeros_like(cls_prob_object[:duplicate_labels.size(0)]).type(
        torch.ByteTensor)
    for i in range(duplicate_labels.size(0)):
        mask[i, duplicate_labels.data[i][0]] = 1
    selected_prob = torch.masked_select(
        cls_prob_object[:duplicate_labels.size(0)], mask)
    reranked_score = nms_module(visual_features[:duplicate_labels.size(0)],
                                selected_prob, rois[:duplicate_labels.size(0)])
    selected_prob = selected_prob.unsqueeze(1) * reranked_score
    loss = F.binary_cross_entropy(selected_prob, duplicate_labels.float())
    loss.backward()
    print(nms_module.transform_rescore.weight.grad)
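The test harness above hand-builds a one-hot ByteTensor mask to read out, for each ROI, the probability of its ground-truth class. On newer PyTorch a boolean mask (or an equivalent gather) does the same job; a minimal sketch, with random probabilities standing in for cls_prob_object:

import torch

cls_prob = torch.rand(5, 20)                        # per-ROI class probabilities
labels = torch.tensor([3, 0, 7, 7, 1])              # ground-truth class per ROI

mask = torch.zeros_like(cls_prob, dtype=torch.bool)
mask[torch.arange(5), labels] = True                # one True per row, at the label column
selected_a = torch.masked_select(cls_prob, mask)    # row-major order == ROI order

selected_b = cls_prob.gather(1, labels.unsqueeze(1)).squeeze(1)   # equivalent gather
print(torch.allclose(selected_a, selected_b))       # True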
def _generate(self, src_tokens, src_lengths, beam_size=None, maxlen=None, prefix_tokens=None): bsz, srclen = src_tokens.size() maxlen = min(maxlen, self.maxlen) if maxlen is not None else self.maxlen # the max beam size is the dictionary size - 1, since we never select pad beam_size = beam_size if beam_size is not None else self.beam_size beam_size = min(beam_size, self.vocab_size - 1) encoder_outs = [] incremental_states = {} for model in self.models: if not self.retain_dropout: model.eval() if isinstance(model.decoder, FairseqIncrementalDecoder): incremental_states[model] = {} else: incremental_states[model] = None # compute the encoder output for each beam encoder_out = model.encoder( src_tokens.repeat(1, beam_size).view(-1, srclen), src_lengths.expand( beam_size, src_lengths.numel()).t().contiguous().view(-1), ) encoder_outs.append(encoder_out) # initialize buffers scores = src_tokens.data.new(bsz * beam_size, maxlen + 1).float().fill_(0) scores_buf = scores.clone() tokens = src_tokens.data.new(bsz * beam_size, maxlen + 2).fill_(self.pad) tokens_buf = tokens.clone() tokens[:, 0] = self.eos attn = scores.new(bsz * beam_size, src_tokens.size(1), maxlen + 2) attn_buf = attn.clone() # list of completed sentences finalized = [[] for i in range(bsz)] finished = [False for i in range(bsz)] worst_finalized = [{ 'idx': None, 'score': -math.inf } for i in range(bsz)] num_remaining_sent = bsz # number of candidate hypos per step cand_size = 2 * beam_size # 2 x beam size in case half are EOS # offset arrays for converting between different indexing schemes bbsz_offsets = (torch.arange(0, bsz) * beam_size).unsqueeze(1).type_as(tokens) cand_offsets = torch.arange(0, cand_size).type_as(tokens) # helper function for allocating buffers on the fly buffers = {} def buffer(name, type_of=tokens): # noqa if name not in buffers: buffers[name] = type_of.new() return buffers[name] def is_finished(sent, step, unfinalized_scores=None): """ Check whether we've finished generation for a given sentence, by comparing the worst score among finalized hypotheses to the best possible score among unfinalized hypotheses. """ assert len(finalized[sent]) <= beam_size if len(finalized[sent]) == beam_size: if self.stop_early or step == maxlen or unfinalized_scores is None: return True # stop if the best unfinalized score is worse than the worst # finalized one best_unfinalized_score = unfinalized_scores[sent].max() if self.normalize_scores: best_unfinalized_score /= maxlen**self.len_penalty if worst_finalized[sent]['score'] >= best_unfinalized_score: return True return False def finalize_hypos(step, bbsz_idx, eos_scores, unfinalized_scores=None): """ Finalize the given hypotheses at this step, while keeping the total number of finalized hypotheses per sentence <= beam_size. Note: the input must be in the desired finalization order, so that hypotheses that appear earlier in the input are preferred to those that appear later. 
Args: step: current time step bbsz_idx: A vector of indices in the range [0, bsz*beam_size), indicating which hypotheses to finalize eos_scores: A vector of the same size as bbsz_idx containing scores for each hypothesis unfinalized_scores: A vector containing scores for all unfinalized hypotheses """ assert bbsz_idx.numel() == eos_scores.numel() # clone relevant token and attention tensors tokens_clone = tokens.index_select(0, bbsz_idx) tokens_clone = tokens_clone[:, 1:step + 2] # skip the first index, which is EOS tokens_clone[:, step] = self.eos attn_clone = attn.index_select(0, bbsz_idx)[:, :, 1:step + 2] # compute scores per token position pos_scores = scores.index_select(0, bbsz_idx)[:, :step + 1] pos_scores[:, step] = eos_scores # convert from cumulative to per-position scores pos_scores[:, 1:] = pos_scores[:, 1:] - pos_scores[:, :-1] # normalize sentence-level scores if self.normalize_scores: eos_scores /= (step + 1)**self.len_penalty cum_unfin = [] prev = 0 for f in finished: if f: prev += 1 else: cum_unfin.append(prev) sents_seen = set() for i, (idx, score) in enumerate( zip(bbsz_idx.tolist(), eos_scores.tolist())): unfin_idx = idx // beam_size sent = unfin_idx + cum_unfin[unfin_idx] sents_seen.add((sent, unfin_idx)) def get_hypo(): # remove padding tokens from attn scores nonpad_idxs = src_tokens[sent].ne(self.pad) hypo_attn = attn_clone[i][nonpad_idxs] _, alignment = hypo_attn.max(dim=0) return { 'tokens': tokens_clone[i], 'score': score, 'attention': hypo_attn, # src_len x tgt_len 'alignment': alignment, 'positional_scores': pos_scores[i], } if len(finalized[sent]) < beam_size: finalized[sent].append(get_hypo()) elif not self.stop_early and score > worst_finalized[sent][ 'score']: # replace worst hypo for this sentence with new/better one worst_idx = worst_finalized[sent]['idx'] if worst_idx is not None: finalized[sent][worst_idx] = get_hypo() # find new worst finalized hypo for this sentence idx, s = min(enumerate(finalized[sent]), key=lambda r: r[1]['score']) worst_finalized[sent] = { 'score': s['score'], 'idx': idx, } newly_finished = [] for sent, unfin_idx in sents_seen: # check termination conditions for this sentence if not finished[sent] and is_finished(sent, step, unfinalized_scores): finished[sent] = True newly_finished.append(unfin_idx) return newly_finished reorder_state = None batch_idxs = None for step in range(maxlen + 1): # one extra step for EOS marker # reorder decoder internal states based on the prev choice of beams if reorder_state is not None: if batch_idxs is not None: # update beam indices to take into account removed sentences corr = batch_idxs - torch.arange( batch_idxs.numel()).type_as(batch_idxs) reorder_state.view(-1, beam_size).add_( corr.unsqueeze(-1) * beam_size) for i, model in enumerate(self.models): if isinstance(model.decoder, FairseqIncrementalDecoder): model.decoder.reorder_incremental_state( incremental_states[model], reorder_state) encoder_outs[i] = model.decoder.reorder_encoder_out( encoder_outs[i], reorder_state) probs, avg_attn_scores = self._decode(tokens[:, :step + 1], encoder_outs, incremental_states) if step == 0: # at the first step all hypotheses are equally likely, so use # only the first beam probs = probs.unfold(0, 1, beam_size).squeeze(2).contiguous() scores = scores.type_as(probs) scores_buf = scores_buf.type_as(probs) elif not self.sampling: # make probs contain cumulative scores for each hypothesis probs.add_(scores[:, step - 1].view(-1, 1)) probs[:, self.pad] = -math.inf # never select pad probs[:, self.unk] -= 
self.unk_penalty # apply unk penalty # Record attention scores attn[:, :, step + 1].copy_(avg_attn_scores) cand_scores = buffer('cand_scores', type_of=scores) cand_indices = buffer('cand_indices') cand_beams = buffer('cand_beams') eos_bbsz_idx = buffer('eos_bbsz_idx') eos_scores = buffer('eos_scores', type_of=scores) if step < maxlen: if prefix_tokens is not None and step < prefix_tokens.size(1): probs_slice = probs.view(bsz, -1, probs.size(-1))[:, 0, :] cand_scores = torch.gather( probs_slice, dim=1, index=prefix_tokens[:, step].view(-1, 1).data).expand( -1, cand_size) cand_indices = prefix_tokens[:, step].view(-1, 1).expand( bsz, cand_size).data cand_beams.resize_as_(cand_indices).fill_(0) elif self.sampling: assert self.pad == 1, 'sampling assumes the first two symbols can be ignored' if self.sampling_topk > 0: values, indices = probs[:, 2:].topk(self.sampling_topk) exp_probs = values.div_( self.sampling_temperature).exp() if step == 0: torch.multinomial(exp_probs, beam_size, replacement=True, out=cand_indices) else: torch.multinomial(exp_probs, 1, replacement=True, out=cand_indices) torch.gather(exp_probs, dim=1, index=cand_indices, out=cand_scores) torch.gather(indices, dim=1, index=cand_indices, out=cand_indices) cand_indices.add_(2) else: exp_probs = probs.div_( self.sampling_temperature).exp_().view( -1, self.vocab_size) if step == 0: # we exclude the first two vocab items, one of which is pad torch.multinomial(exp_probs[:, 2:], beam_size, replacement=True, out=cand_indices) else: torch.multinomial(exp_probs[:, 2:], 1, replacement=True, out=cand_indices) cand_indices.add_(2) torch.gather(exp_probs, dim=1, index=cand_indices, out=cand_scores) cand_scores.log_() cand_indices = cand_indices.view(bsz, -1).repeat(1, 2) cand_scores = cand_scores.view(bsz, -1).repeat(1, 2) if step == 0: cand_beams = torch.zeros( bsz, cand_size).type_as(cand_indices) else: cand_beams = torch.arange(0, beam_size).repeat( bsz, 2).type_as(cand_indices) # make scores cumulative cand_scores.add_( torch.gather( scores[:, step - 1].view(bsz, beam_size), dim=1, index=cand_beams, )) else: # take the best 2 x beam_size predictions. We'll choose the first # beam_size of these which don't predict eos to continue with. 
torch.topk( probs.view(bsz, -1), k=min(cand_size, probs.view(bsz, -1).size(1) - 1), # -1 so we never select pad out=(cand_scores, cand_indices), ) torch.div(cand_indices, self.vocab_size, out=cand_beams) cand_indices.fmod_(self.vocab_size) else: # finalize all active hypotheses once we hit maxlen # pick the hypothesis with the highest prob of EOS right now torch.sort( probs[:, self.eos], descending=True, out=(eos_scores, eos_bbsz_idx), ) num_remaining_sent -= len( finalize_hypos(step, eos_bbsz_idx, eos_scores)) assert num_remaining_sent == 0 break # cand_bbsz_idx contains beam indices for the top candidate # hypotheses, with a range of values: [0, bsz*beam_size), # and dimensions: [bsz, cand_size] cand_bbsz_idx = cand_beams.add(bbsz_offsets) # finalize hypotheses that end in eos eos_mask = cand_indices.eq(self.eos) finalized_sents = set() if step >= self.minlen: # only consider eos when it's among the top beam_size indices torch.masked_select( cand_bbsz_idx[:, :beam_size], mask=eos_mask[:, :beam_size], out=eos_bbsz_idx, ) if eos_bbsz_idx.numel() > 0: torch.masked_select( cand_scores[:, :beam_size], mask=eos_mask[:, :beam_size], out=eos_scores, ) finalized_sents = finalize_hypos(step, eos_bbsz_idx, eos_scores, cand_scores) num_remaining_sent -= len(finalized_sents) assert num_remaining_sent >= 0 if num_remaining_sent == 0: break assert step < maxlen if len(finalized_sents) > 0: new_bsz = bsz - len(finalized_sents) # construct batch_idxs which holds indices of batches to keep for the next pass batch_mask = torch.ones(bsz).type_as(cand_indices) batch_mask[cand_indices.new(finalized_sents)] = 0 batch_idxs = batch_mask.nonzero().squeeze(-1) eos_mask = eos_mask[batch_idxs] cand_beams = cand_beams[batch_idxs] bbsz_offsets.resize_(new_bsz, 1) cand_bbsz_idx = cand_beams.add(bbsz_offsets) cand_scores = cand_scores[batch_idxs] cand_indices = cand_indices[batch_idxs] if prefix_tokens is not None: prefix_tokens = prefix_tokens[batch_idxs] scores = scores.view(bsz, -1)[batch_idxs].view( new_bsz * beam_size, -1) scores_buf.resize_as_(scores) tokens = tokens.view(bsz, -1)[batch_idxs].view( new_bsz * beam_size, -1) tokens_buf.resize_as_(tokens) attn = attn.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, attn.size(1), -1) attn_buf.resize_as_(attn) bsz = new_bsz else: batch_idxs = None # set active_mask so that values > cand_size indicate eos hypos # and values < cand_size indicate candidate active hypos. 
# After, the min values per row are the top candidate active hypos active_mask = buffer('active_mask') torch.add( eos_mask.type_as(cand_offsets) * cand_size, cand_offsets[:eos_mask.size(1)], out=active_mask, ) # get the top beam_size active hypotheses, which are just the hypos # with the smallest values in active_mask active_hypos, _ignore = buffer('active_hypos'), buffer('_ignore') torch.topk(active_mask, k=beam_size, dim=1, largest=False, out=(_ignore, active_hypos)) active_bbsz_idx = buffer('active_bbsz_idx') torch.gather( cand_bbsz_idx, dim=1, index=active_hypos, out=active_bbsz_idx, ) active_scores = torch.gather( cand_scores, dim=1, index=active_hypos, out=scores[:, step].view(bsz, beam_size), ) active_bbsz_idx = active_bbsz_idx.view(-1) active_scores = active_scores.view(-1) # copy tokens and scores for active hypotheses torch.index_select( tokens[:, :step + 1], dim=0, index=active_bbsz_idx, out=tokens_buf[:, :step + 1], ) torch.gather( cand_indices, dim=1, index=active_hypos, out=tokens_buf.view(bsz, beam_size, -1)[:, :, step + 1], ) if step > 0: torch.index_select( scores[:, :step], dim=0, index=active_bbsz_idx, out=scores_buf[:, :step], ) torch.gather( cand_scores, dim=1, index=active_hypos, out=scores_buf.view(bsz, beam_size, -1)[:, :, step], ) # copy attention for active hypotheses torch.index_select( attn[:, :, :step + 2], dim=0, index=active_bbsz_idx, out=attn_buf[:, :, :step + 2], ) # swap buffers tokens, tokens_buf = tokens_buf, tokens scores, scores_buf = scores_buf, scores attn, attn_buf = attn_buf, attn # reorder incremental state in decoder reorder_state = active_bbsz_idx # sort by score descending for sent in range(len(finalized)): finalized[sent] = sorted(finalized[sent], key=lambda r: r['score'], reverse=True) return finalized
def EntropyLoss(self, input_):
    mask = input_.ge(0.000001)
    mask_out = torch.masked_select(input_, mask)
    entropy = -(torch.sum(mask_out * torch.log(mask_out)))
    return entropy / float(input_.size(0))
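EntropyLoss above uses masked_select purely as a numerical guard: entries below 1e-6 are dropped before the log so that log(0) never produces NaN or -inf. A short check of that behaviour with arbitrary probabilities:

import torch

probs = torch.tensor([[0.7, 0.3, 0.0],
                      [0.5, 0.5, 0.0]])
safe = torch.masked_select(probs, probs.ge(1e-6))        # drop (near-)zero entries before the log
entropy = -(safe * safe.log()).sum() / probs.size(0)
print(entropy)                                           # finite; no NaN from log(0)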
def ROIAlign(feature_maps, rois, config, pool_size, mode='bilinear'): """Implements ROI Align on the features. Params: - pool_shape: [height, width] of the output pooled regions. Usually [7, 7] - image_shape: [height, width, chanells]. Shape of input image in pixels Inputs: - boxes: [batch, num_boxes, (x1, y1, x2, y2)] in normalized coordinates. Possibly padded with zeros if not enough boxes to fill the array. - Feature maps: List of feature maps from different levels of the pyramid. Each is [batch, channels, height, width] Output: Pooled regions in the shape: [batch, num_boxes, height, width, channels]. The width and height are those specific in the pool_shape in the layer constructor. """ """ [ x2-x1 x1 + x2 - W + 1 ] [ ----- 0 --------------- ] [ W - 1 W - 1 ] [ ] [ y2-y1 y1 + y2 - H + 1 ] [ 0 ----- --------------- ] [ H - 1 H - 1 ] """ #feature_maps= [P2, P3, P4, P5] rois = rois.detach() crop_resize = CropAndResize(pool_size, pool_size, 0) roi_number = rois.size()[1] pooled = rois.data.new( config.IMAGES_PER_GPU*rois.size( 1), 256, pool_size, pool_size).zero_() rois = rois.view( config.IMAGES_PER_GPU*rois.size(1), 4) # Loop through levels and apply ROI pooling to each. P2 to P5. x_1 = rois[:, 0] y_1 = rois[:, 1] x_2 = rois[:, 2] y_2 = rois[:, 3] roi_level = log2_graph( torch.div(torch.sqrt((y_2 - y_1) * (x_2 - x_1)), 224.0)) roi_level = torch.clamp(torch.clamp( torch.add(torch.round(roi_level), 4), min=2), max=5) # P2 is 256x256, P3 is 128x128, P4 is 64x64, P5 is 32x32 # P2 is 4, P3 is 8, P4 is 16, P5 is 32 for i, level in enumerate(range(2, 6)): scaling_ratio = 2**level height = float(config.IMAGE_MAX_DIM)/ scaling_ratio width = float(config.IMAGE_MAX_DIM) / scaling_ratio ixx = torch.eq(roi_level, level) box_indices = ixx.view(-1).int() * 0 ix = torch.unsqueeze(ixx, 1) level_boxes = torch.masked_select(rois, ix) if level_boxes.size()[0] == 0: continue level_boxes = level_boxes.view(-1, 4) crops = crop_resize(feature_maps[i], torch.div( level_boxes, float(config.IMAGE_MAX_DIM) )[:, [1, 0, 3, 2]], box_indices) indices_pooled = ixx.nonzero()[:, 0] pooled[indices_pooled.data, :, :, :] = crops.data pooled = pooled.view(config.IMAGES_PER_GPU, roi_number, 256, pool_size, pool_size) pooled = Variable(pooled).cuda() return pooled
def train(model, optim, sche, db, opt, model_0): """ Args: model (torch.nn.module): the model to be trained optim (torch.optim.X): torch optimizer to be used db (torch.utils.data.Dataset): prepared tor ch dataset object opt: command line input from the user """ # for debug # outputs_A = [] # outputs_B = [] accuracy_history = [] if opt.active: # if active learning is enabled # Get c_A, c_B and Sc_A2B first # Prepare hooker to get layer features # We use this 2 aggregators for the whole file, so be careful 1) to empty them properly; 2) use only them as feature_maps aggregators def hook_A(module, input, output): outputs_A.append( output.to(torch.device("cpu")).detach().numpy().reshape( output.shape[0], -1)) def hook_B(module, input, output): outputs_B.append( output.to(torch.device("cpu")).detach().numpy().reshape( output.shape[0], -1)) if 'Alex'.lower() in opt.model_type.lower(): handleA = model.alex.features[-1].register_forward_hook(hook_A) handleB = model.alex.classifier[-3].register_forward_hook(hook_B) elif 'VGG16'.lower() in opt.model_type.lower(): handleA = model.vgg16.features[-1].register_forward_hook(hook_A) handleB = model.vgg16.classifier[-3].register_forward_hook(hook_B) # Get c_A, c_B, Sc_A2B embed_dir = path.join('../datasets/c_x_A_B', opt.model_type.lower()) if not (path.exists(embed_dir) and path.exists(path.join(embed_dir, 'c_A.npy')) and path.exists(path.join(embed_dir, 'c_B.npy'))): # create the directory you want to save to if not path.exists(embed_dir): os.makedirs(embed_dir) outputs_A = [] outputs_B = [] imagenet_loader = torch.utils.data.DataLoader( db['imagenet'], batch_size=opt.batch_size, shuffle=False) model.eval() for batch_idx, batch in enumerate(imagenet_loader): data = batch['image'] if opt.cuda: data = data.cuda() with torch.no_grad(): model(data) del data #assert len(outputs_A) == 1000 #assert len(outputs_B) == 1000 c_A = outputs_A = np.vstack(outputs_A) c_B = outputs_B = np.vstack(outputs_B) np.save(path.join(embed_dir, 'c_A.npy'), c_A) np.save(path.join(embed_dir, 'c_B.npy'), c_B) else: c_A = np.load(path.join(embed_dir, 'c_A.npy')) c_B = np.load(path.join(embed_dir, 'c_B.npy')) if not path.exists(path.join(embed_dir, 'Sc_A2B.npy')): ScA = dnu.Sx_generator(c_A, c_A) ScB = dnu.Sx_generator(c_B, c_B) Sc_A2B = ScA - ScB np.save(path.join(embed_dir, 'Sc_A2B.npy'), Sc_A2B) else: Sc_A2B = np.load(path.join(embed_dir, 'Sc_A2B.npy')) # Start fine-tuning (transfer learning) process! 
epoch is only 1 criterion = nn.CrossEntropyLoss() model_0.eval() if opt.alternate: current_class = 0 for epoch in range(1, opt.epochs + 1): #### Here, firstly, compute score and get active learning batch of size opt.active_batch_size n_samples = len(db['train']) # sample with replacement sampler = torch.utils.data.sampler.WeightedRandomSampler( np.ones(n_samples) / n_samples, n_samples) train_loader = torch.utils.data.DataLoader( db['train'], batch_size=opt.active_sample_size if opt.active else opt.batch_size, shuffle=False, sampler=sampler) # loader = torch.utils.data.DataLoader(db['eval'], batch_size=opt.eval_batch_size, shuffle=False, num_workers=4) # num_eval = len(db['eval']) # for batch_idx, batch in enumerate(loader): # if opt.eval: # evaluate(model, db, opt) # model.train() for batch_idx, batch in enumerate(train_loader): if batch_idx == 50: break data = batch['image'] target = batch['label'] if opt.cuda: with torch.no_grad(): data, target = data.cuda(), target.cuda() if opt.active: if opt.alternate: mask = target == current_class selected_target = torch.masked_select(target, mask) mask = mask.unsqueeze(1) if mask.sum() == 0: continue selected = torch.masked_select( data.view(opt.active_sample_size, -1), mask) selected = selected.view(mask.sum(), 3, 224, 224) data = selected target = selected_target current_class = 1 - current_class # extract feature maps and score the sampled batch outputs_A = [] outputs_B = [] model.eval() with torch.no_grad(): outputs = model(data) # assert len(outputs_A[0]) == opt.active_sample_size # assert len(outputs_B[0]) == opt.active_sample_size x_A = outputs_A[0] x_B = outputs_B[0] alpha = F.softmax(model_0(data), 1).to(torch.device("cpu")).detach().numpy() with torch.no_grad(): p = F.softmax(model(data), 1).to(torch.device("cpu")).detach().numpy() t = batch_idx # temperature for decaying lamb value btw distinctiveness & uncertainty best_indices = np.argsort( dnu.score(opt.lamb, t, p, alpha, x_A, x_B, c_A, c_B, Sc_A2B=Sc_A2B))[::-1] # best_indices = np.random.permutation(opt.active_sample_size) #### Secondly, fine-tune train the module # sche.step() model.train() # erase all computed gradient optim.zero_grad() # take data with maximum score if opt.active: outputs = model( data[best_indices[:opt.active_batch_size].tolist()]) loss = criterion( outputs, target[best_indices[:opt.active_batch_size].tolist()]) else: outputs = model(data) #_, preds = torch.max(outputs, 1) loss = criterion(outputs, target) # if batch_idx > 10: # print('debug') # compute gradient loss.backward() #train one step optim.step() if batch_idx % opt.report_every == 0: if opt.active: print( 'Train Epoch: {} [{}/{} ({:.0f}%)] Actively choosen {}\tLoss: {:.6f} ' .format(epoch, batch_idx * opt.active_sample_size, len(db['train']), 100. * batch_idx / len(train_loader), batch_idx * opt.active_batch_size, loss.data.item())) else: print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f} '. format(epoch, batch_idx * opt.batch_size, len(db['train']), 100. * batch_idx / len(train_loader), loss.data.item())) # evaluate model if specified if opt.eval and batch_idx % opt.eval_every == 0: accuracy_history.append(evaluate(model, db, opt)) model.train() accuracy_history = np.array(accuracy_history) np.save( './history' + 'active_' + str(opt.active) + 'lambda_' + str(opt.lamb) + '_alternate_' + str(opt.alternate) + '.npy', accuracy_history) if opt.active: handleA.remove() handleB.remove()
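The alternate-class branch in train above flattens each image so that a [B, 1] mask can broadcast over every pixel, selects the rows matching the current class with masked_select, and restores the image shape afterwards. A reduced sketch of that selection; the batch size and image size here are stand-ins.

import torch

data = torch.randn(8, 3, 224, 224)
target = torch.randint(0, 2, (8,))
current_class = 0

mask = target.eq(current_class)                             # [B] boolean row mask
selected_target = torch.masked_select(target, mask)
flat = data.view(data.size(0), -1)                          # [B, 3*224*224]
selected = torch.masked_select(flat, mask.unsqueeze(1))     # mask broadcasts over every pixel
selected = selected.view(int(mask.sum()), 3, 224, 224)
print(selected.shape, selected_target.shape)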
def evaluate(dataset, train_steps=None): examples = processor.get_examples(data_dir, dataset) examples_dict = {e.guid: e for e in examples} features, tokenize_info = convert_examples_to_features(examples, max_seq_length, tokenizer, label_list) logger.info("***** Running Evaluation on %s set*****" % dataset) logger.info(" Num examples = %d", len(examples)) logger.info(" Num features = %d", len(features)) logger.info(" Batch size = %d", config[dataset]['batch_size']) data = create_tensor_data(features) sampler = SequentialSampler(data) dataloader = DataLoader(data, sampler=sampler, batch_size=config[dataset]['batch_size']) model.eval() predictions = [] predict_masks = [] nb_steps, nb_examples = 0, 0 loss, accuracy = 0, 0 for batch in tqdm(dataloader, desc="Evaluating"): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, predict_mask, label_ids = batch with torch.no_grad(): tmp_loss = model(input_ids, segment_ids, input_mask, predict_mask, label_ids) outputs, _ = model(input_ids, segment_ids, input_mask, predict_mask) if not config['task']['cal_X_loss']: reshaped_predict_mask, _, _ = valid_first(predict_mask) else: reshaped_predict_mask = predict_mask masked_label_ids = torch.masked_select(label_ids, predict_mask) masked_outputs = torch.masked_select(outputs, reshaped_predict_mask) masked_label_ids = masked_label_ids.cpu().numpy() masked_outputs = masked_outputs.detach().cpu().numpy() def cal_accuracy(outputs, labels): return np.sum(outputs == labels) tmp_accuracy = cal_accuracy(masked_outputs, masked_label_ids) predictions.extend(outputs.detach().cpu().numpy().tolist()) predict_masks.extend(reshaped_predict_mask.detach().cpu().numpy().tolist()) if config['n_gpu'] > 1: tmp_loss = tmp_loss.mean() # mean() to average on multi-gpu. loss += tmp_loss.item() accuracy += tmp_accuracy nb_examples += predict_mask.detach().cpu().numpy().sum() nb_steps += 1 loss = loss / nb_steps accuracy = accuracy / nb_examples logger.info('eval_loss: %.4f; eval_accuracy: %.4f' % (loss, accuracy)) if train_steps is not None: fn1 = "%s.predict_epoch_%s" % (dataset, train_steps) fn2 = "%s.mistake_epoch_%s" % (dataset, train_steps) else: fn1 = "%s.predict" % dataset fn2 = "%s.mistake" % dataset writer1 = codecs.open(os.path.join(config['task']['output_dir'], fn1), 'w', encoding='utf-8') writer2 = codecs.open(os.path.join(config['task']['output_dir'], fn2), 'w', encoding='utf-8') for feature, predict_line, predict_mask in zip(features, predictions, predict_masks): example = examples_dict[feature.ex_id] w1_sent = [] word_idx = feature.start_ix mistake = False for index, label_id in enumerate(predict_line[:sum(predict_mask)]): if example.words[word_idx] == '[SEP]': word_idx += 1 w1_sent.append("\n") line = ' '.join([example.words[word_idx], example.labels[word_idx], label_list[label_id]]) w1_sent.append(line) if label_list[label_id] != example.labels[word_idx]: mistake = True word_idx += 1 writer1.write('\n'.join(w1_sent) + '\n\n') if mistake: writer2.write('\n'.join(w1_sent) + '\n\n') writer1.close() writer2.close() return loss
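# --- Illustrative sketch (not part of the original code): the masked token-level accuracy
# computed inside evaluate() above, reduced to plain tensors. Shapes and values are made up.
import torch

outputs = torch.tensor([[1, 0, 2, 2], [0, 1, 1, 0]])               # predicted label ids
label_ids = torch.tensor([[1, 0, 2, 1], [0, 2, 1, 0]])             # gold label ids
predict_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]]).bool()   # positions that count

masked_outputs = torch.masked_select(outputs, predict_mask)
masked_labels = torch.masked_select(label_ids, predict_mask)
n_correct = (masked_outputs == masked_labels).sum().item()
n_total = predict_mask.sum().item()
print(n_correct / n_total)   # accuracy over unmasked positions only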
def train(self): self.scheduler.step() self.loss.step() epoch = self.scheduler.last_epoch + 1 lr = self.scheduler.get_lr()[0] self.ckp.write_log('[Epoch {}]\tLearning rate: {:.2e}'.format( epoch, Decimal(lr))) self.loss.start_log() self.model.train() timer_data, timer_model = utility.timer(), utility.timer() for batch, (lr, hr, _, idx_scale) in enumerate(self.loader_train): lr, hr = self.prepare(lr, hr) timer_data.hold() timer_model.tic() N, C, H, W = lr.size() _, _, outH, outW = hr.size() scale_coord_map, mask = self.input_matrix_wpn( H, W, self.args.scale[idx_scale]) ### get the position matrix, mask if self.args.n_GPUs > 1: scale_coord_map = torch.cat([scale_coord_map] * self.args.n_GPUs, 0) else: scale_coord_map = scale_coord_map.cuda() self.optimizer.zero_grad() sr = self.model(lr, idx_scale, scale_coord_map) re_sr = torch.masked_select(sr, mask.cuda()) re_sr = re_sr.contiguous().view(N, C, outH, outW) loss = self.loss(re_sr, hr) if loss.item() < self.args.skip_threshold * self.error_last: loss.backward() self.optimizer.step() else: print('Skip this batch {}! (Loss: {})'.format( batch + 1, loss.item())) timer_model.hold() if (batch + 1) % self.args.print_every == 0: self.ckp.write_log('[{}/{}]\t{}\t{:.1f}+{:.1f}s'.format( (batch + 1) * self.args.batch_size, len(self.loader_train.dataset), self.loss.display_loss(batch), timer_model.release(), timer_data.release())) timer_data.tic() self.loss.end_log(len(self.loader_train)) self.error_last = self.loss.log[-1, -1] if self.args.n_GPUs == 1: target = self.model else: target = self.model #.module torch.save( target.state_dict(), os.path.join(self.ckp.dir, 'model', 'model_{}.pt'.format(epoch)))
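# --- Illustrative sketch (not part of the original code): why the training loop above calls
# torch.masked_select on the network output and then .view(N, C, outH, outW). The model is
# assumed to emit a slightly over-sized map; the mask keeps exactly outH*outW positions per
# channel. All sizes here are made up.
import torch

N, C = 2, 3
outH, outW = 6, 6
padH, padW = 8, 8                        # over-sized prediction from the network

sr = torch.randn(N, C, padH, padW)
mask = torch.zeros(N, C, padH, padW, dtype=torch.bool)
mask[:, :, :outH, :outW] = True          # stand-in for the mask from input_matrix_wpn

re_sr = torch.masked_select(sr, mask)    # 1-D tensor with N*C*outH*outW elements
re_sr = re_sr.contiguous().view(N, C, outH, outW)
print(re_sr.shape)                       # torch.Size([2, 3, 6, 6])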
def test_net(save_folder, net, dataset, thresh=0.05): num_images = len(dataset) all_boxes = [[[] for _ in range(num_images)] for _ in range(2)] _t = {'im_detect': Timer(), 'misc': Timer()} output_dir = get_output_dir(os.path.join(save_folder, 'sfd_hand'), set_type) det_file = os.path.join(output_dir, 'detections.pkl') for i in range(num_images): img = dataset.pull_image(i) h, w, _ = img.shape shrink = np.sqrt(1700 * 1200 / (img.shape[0] * img.shape[1])) image = cv2.resize(img, None, None, fx=shrink, fy=shrink, interpolation=cv2.INTER_LINEAR) x = to_chw_bgr(image) x = x.astype('float32') x -= cfg.img_mean x = x[[2, 1, 0], :, :] x = Variable(torch.from_numpy(x).unsqueeze(0)) if use_cuda: x = x.cuda() _t['im_detect'].tic() detections = net(x).data detect_time = _t['im_detect'].toc(average=False) for j in range(1, detections.size(1)): dets = detections[0, j, :] mask = dets[:, 0].gt(thresh).expand(5, dets.size(0)).t() dets = torch.masked_select(dets, mask).view(-1, 5) if dets.dim() == 0: continue boxes = dets[:, 1:] boxes[:, 0] *= w boxes[:, 2] *= w boxes[:, 1] *= h boxes[:, 3] *= h scores = dets[:, 0].cpu().numpy() cls_dets = np.hstack( (boxes.cpu().numpy(), scores[:, np.newaxis])).astype(np.float32, copy=False) all_boxes[j][i] = cls_dets fin_mask = np.where(scores > 0.6)[0] bboxes = boxes.cpu().numpy()[fin_mask] scores = scores[fin_mask] for k in range(len(scores)): leftup = (int(bboxes[k][0]), int(bboxes[k][1])) right_bottom = (int(bboxes[k][2]), int(bboxes[k][3])) cv2.rectangle(img, leftup, right_bottom, (0, 255, 0), 2) save_file = os.path.join(output_dir, '{}.jpg'.format(i + 1)) cv2.imwrite(save_file, img) print('im_detect: {:d}/{:d} {:.3f}s'.format(i + 1, num_images, detect_time)) with open(det_file, 'wb') as f: pickle.dump(all_boxes, f, pickle.HIGHEST_PROTOCOL) print('Evaluating detections') evaluate_detections(all_boxes, output_dir, dataset)
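# --- Illustrative sketch (not part of the original code): the score-threshold filtering used
# in test_net() above. Each row of dets is [score, x1, y1, x2, y2]; the numbers are made up.
import torch

dets = torch.tensor([[0.90, 0.1, 0.1, 0.5, 0.5],
                     [0.02, 0.2, 0.2, 0.6, 0.6],
                     [0.70, 0.3, 0.1, 0.9, 0.8]])
thresh = 0.05

# build a (num_boxes, 5) mask that is True on every column of a kept row
mask = dets[:, 0].gt(thresh).expand(5, dets.size(0)).t()
kept = torch.masked_select(dets, mask).view(-1, 5)
print(kept.shape)    # torch.Size([2, 5]) -- the two boxes scoring above the threshold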
def test(self): epoch = self.scheduler.last_epoch + 1 self.ckp.write_log('\nEvaluation:') self.ckp.add_log(torch.zeros(1, len(self.scale))) self.model.eval() timer_test = utility.timer() with torch.no_grad(): for idx_scale, scale in enumerate(self.scale): eval_acc = 0 eval_acc_ssim = 0 self.loader_test.dataset.set_scale(idx_scale) #tqdm_test = tqdm(self.loader_test, ncols=80) for idx_img, (lr, hr, filename, _) in enumerate(self.loader_test): filename = filename[0] no_eval = (hr.nelement() == 1) if not no_eval: lr, hr = self.prepare(lr, hr) else: lr, = self.prepare(lr) N, C, H, W = lr.size() scale = self.args.scale[idx_scale] outH, outW = int(H * scale), int(W * scale) #_,_,outH,outW = hr.size() #timer_test.tic() scale_coord_map, mask = self.input_matrix_wpn( H, W, self.args.scale[idx_scale]) #position, mask = self.pos_matrix(H,W,self.args.scale[idx_scale]) #print(timer_test.toc()) if self.args.n_GPUs > 1: scale_coord_map = torch.cat([scale_coord_map] * self.args.n_GPUs, 0) else: scale_coord_map = scale_coord_map.cuda() timer_test.tic() sr = self.model(lr, idx_scale, scale_coord_map) timer_test.hold() re_sr = torch.masked_select(sr, mask.cuda()) sr = re_sr.contiguous().view(N, C, outH, outW) sr = utility.quantize(sr, self.args.rgb_range) #timer_test.hold() save_list = [sr] if not no_eval: eval_acc += utility.calc_psnr( sr, hr, scale, self.args.rgb_range, benchmark=self.loader_test.dataset.benchmark) eval_acc_ssim += utility.calc_ssim( sr, hr, scale, benchmark=self.loader_test.dataset.benchmark) save_list.extend([lr, hr]) if self.args.save_results: a = 1 self.ckp.save_results(filename, save_list, scale) self.ckp.log[-1, idx_scale] = eval_acc / len(self.loader_test) best = self.ckp.log.max(0) # print(timer_test.acc/100) self.ckp.write_log( '[{} x{}]\tPSNR: {:.3f} SSIM: {:.4f} (Best: {:.3f} @epoch {})' .format(self.args.data_test, scale, self.ckp.log[-1, idx_scale], eval_acc_ssim / len(self.loader_test), best[0][idx_scale], best[1][idx_scale] + 1)) print(timer_test.acc / 100) self.ckp.write_log('Total time: {:.2f}s\n'.format(timer_test.toc()), refresh=True) if not self.args.test_only: self.ckp.save(self, epoch, is_best=(best[1][0] + 1 == epoch))
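# --- Illustrative sketch (not part of the original code): a plain PSNR computation of the
# kind utility.calc_psnr presumably performs in the evaluation above; the original's border
# cropping and channel weighting are not reproduced. rgb_range is assumed to be the peak value.
import torch

def psnr(sr: torch.Tensor, hr: torch.Tensor, rgb_range: float = 255.0) -> float:
    """10 * log10(MAX^2 / MSE) between a super-resolved and a ground-truth image."""
    mse = torch.mean((sr - hr) ** 2).clamp(min=1e-12)   # clamp guards against log10(inf)
    return (10.0 * torch.log10(rgb_range ** 2 / mse)).item()

hr = torch.rand(1, 3, 32, 32) * 255
sr = hr + torch.randn_like(hr)          # a near-perfect reconstruction
print(psnr(sr, hr))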
def train(myNMT, args, lang1, lang2): # train model # myNMT (NMT model): model to train # args (a set of parameters): from parser # lang1 (Language class): source language # lang2 (Language class): target language myoptim = optim.Adam(myNMT.parameters(), lr=args.lr) training_data = [ IndicesFromPairs(p, lang1, lang2) for p in readPairs(args.source_training_file, args.target_training_file) ] # generate batches def generateBatches(data, batch_size): batches = [] batch = [] for i in range(len(data)): batch.append(data[i]) if len(batch) >= batch_size: batches.append(batch) batch = [] if batch != []: batches.append(batch) batch = [] return batches training_batches_pairs = generateBatches(training_data, args.batch_size) # transfer batches to padded Variables training_batches = [] source_len, target_len = [], [] for b in training_batches_pairs: source_batch = [ sentence[0] for sentence in b] target_batch = [ sentence[1] for sentence in b] source_len.append([len(s) for s in source_batch]) target_len.append([len(s) for s in target_batch]) max_len = source_len[-1][0] source_batch = [ s + [lang1.PAD_token] * (max_len - len(s)) for s in source_batch] max_len = max(target_len[-1]) target_batch = [ s + [lang2.PAD_token] * (max_len - len(s)) for s in target_batch] # mask for target sentence source_variable = ag.Variable(torch.LongTensor(source_batch)) target_variable = ag.Variable(torch.LongTensor(target_batch)) if args.gpu: source_variable = source_variable.cuda() target_variable = target_variable.cuda() training_batches.append((source_variable, target_variable)) for e in range(args.num_epoch): for i in range(len(training_batches)): source, target = training_batches[i] myoptim.zero_grad() loss = 0 criterion = nn.CrossEntropyLoss() # train network encoder_outputs, encoder_hidden = myNMT.encoder(source, source_len[i]) # encoder has bidirectional rnn, dimensions are different decoder_hidden = myNMT.decoder.init_hidden(encoder_hidden) batch_size, length = target.size() decoder_input = ag.Variable(torch.LongTensor([lang2.SOS_token] * target.size()[0])) if args.gpu: decoder_input = decoder_input.cuda() for j in range(length): decoder_output, decoder_hidden = myNMT.decoder(decoder_input, decoder_hidden, encoder_outputs) # compute loss with mask mask_tensor = torch.from_numpy((np.array(target_len[i]) > j).astype(np.int32)).byte() masked_index = ag.Variable(torch.masked_select(torch.arange(0, batch_size), mask_tensor).long()) if args.gpu: masked_index = masked_index.cuda() masked_outputs = torch.index_select(decoder_output, 0, masked_index) masked_targets = torch.index_select(target[:, j], 0, masked_index) loss += criterion(masked_outputs, masked_targets) decoder_input = target[:,j] loss = loss.div(sum(target_len[i])) loss.backward() torch.nn.utils.clip_grad_norm(myNMT.parameters(), args.clip) myoptim.step() print (time.strftime('%Hh %Mm %Ss', time.localtime()), " batch ", i) test = evaluate(myNMT, args.source_validation_file, args.target_validation_file, args, lang1, lang2) print (time.strftime('%Hh %Mm %Ss', time.localtime()), " epoch ", e, " evaluate accuracy ", test) print (time.strftime('%Hh %Mm %Ss', time.localtime()), " epoch ", e, " evaluate accuracy ", test, file=open(args.process_file, 'a')) torch.save(myNMT.state_dict(), args.weights_file+str(e))
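# --- Illustrative sketch (not part of the original code): the per-timestep masking used in
# the NMT training loop above, i.e. dropping already-finished (padded) sentences from the loss
# at step j. Vocabulary size and lengths are made up; modern PyTorch needs no Variable wrapper.
import torch
import torch.nn as nn

batch_size, vocab = 4, 10
target_len = torch.tensor([5, 3, 2, 4])        # true lengths of the 4 target sentences
j = 2                                          # current decoding step

decoder_output = torch.randn(batch_size, vocab)       # logits for step j
target_j = torch.randint(0, vocab, (batch_size,))     # gold token at step j (pad rows are junk)

mask = (target_len > j)                                # rows still inside their sentence
masked_index = torch.masked_select(torch.arange(batch_size), mask)
masked_outputs = torch.index_select(decoder_output, 0, masked_index)
masked_targets = torch.index_select(target_j, 0, masked_index)

loss = nn.CrossEntropyLoss()(masked_outputs, masked_targets)
print(masked_index.tolist(), loss.item())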
def forward(self, q_data, qa_data, target, student_id=None): batch_size = q_data.shape[0] seqlen = q_data.shape[1] q_embed_data = self.q_embed(q_data) qa_embed_data = self.qa_embed(qa_data) memory_value = nn.Parameter( torch.cat([ self.init_memory_value.unsqueeze(0) for _ in range(batch_size) ], 0).data) self.mem.init_value_memory(memory_value) slice_q_data = torch.chunk(q_data, seqlen, 1) slice_q_embed_data = torch.chunk(q_embed_data, seqlen, 1) slice_qa_embed_data = torch.chunk(qa_embed_data, seqlen, 1) value_read_content_l = [] input_embed_l = [] predict_logs = [] for i in range(seqlen): ## Attention q = slice_q_embed_data[i].squeeze(1) correlation_weight = self.mem.attention(q) if_memory_write = slice_q_data[i].squeeze(1).ge(1) if_memory_write = utils.varible( torch.FloatTensor(if_memory_write.data.tolist()), 1) ## Read Process read_content = self.mem.read(correlation_weight) value_read_content_l.append(read_content) input_embed_l.append(q) ## Write Process qa = slice_qa_embed_data[i].squeeze(1) new_memory_value = self.mem.write(correlation_weight, qa, if_memory_write) # read_content_embed = torch.tanh(self.read_embed_linear(torch.cat([read_content, q], 1))) # pred = self.predict_linear(read_content_embed) # predict_logs.append(pred) all_read_value_content = torch.cat( [value_read_content_l[i].unsqueeze(1) for i in range(seqlen)], 1) input_embed_content = torch.cat( [input_embed_l[i].unsqueeze(1) for i in range(seqlen)], 1) # input_embed_content = input_embed_content.view(batch_size * seqlen, -1) # input_embed_content = torch.tanh(self.input_embed_linear(input_embed_content)) # input_embed_content = input_embed_content.view(batch_size, seqlen, -1) predict_input = torch.cat( [all_read_value_content, input_embed_content], 2) read_content_embed = torch.tanh( self.read_embed_linear(predict_input.view(batch_size * seqlen, -1))) pred = self.predict_linear(read_content_embed) # predicts = torch.cat([predict_logs[i] for i in range(seqlen)], 1) target_1d = target # [batch_size * seq_len, 1] mask = target_1d.ge(0) # [batch_size * seq_len, 1] # pred_1d = predicts.view(-1, 1) # [batch_size * seq_len, 1] pred_1d = pred.view(-1, 1) # [batch_size * seq_len, 1] filtered_pred = torch.masked_select(pred_1d, mask) filtered_target = torch.masked_select(target_1d, mask) loss = torch.nn.functional.binary_cross_entropy_with_logits( filtered_pred, filtered_target) return loss, torch.sigmoid(filtered_pred), filtered_target
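# --- Illustrative sketch (not part of the original code): the masked binary cross-entropy at
# the end of forward() above. Positions with target == -1 are padding and are excluded before
# the loss; the numbers are made up.
import torch
import torch.nn.functional as F

pred_1d = torch.randn(6, 1)                                        # raw logits, one per interaction
target_1d = torch.tensor([[1.], [0.], [1.], [-1.], [0.], [-1.]])   # -1 marks padded steps

mask = target_1d.ge(0)
filtered_pred = torch.masked_select(pred_1d, mask)
filtered_target = torch.masked_select(target_1d, mask)

loss = F.binary_cross_entropy_with_logits(filtered_pred, filtered_target)
print(loss.item(), torch.sigmoid(filtered_pred))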
def _generate( self, sample: Dict[str, Dict[str, Tensor]], prefix_tokens: Optional[Tensor] = None, constraints: Optional[Tensor] = None, bos_token: Optional[int] = None, ): incremental_states = torch.jit.annotate( List[Dict[str, Dict[str, Optional[Tensor]]]], [ torch.jit.annotate(Dict[str, Dict[str, Optional[Tensor]]], {}) for i in range(self.model.models_size) ], ) net_input = sample["net_input"] if "src_tokens" in net_input: src_tokens = net_input["src_tokens"] # length of the source text being the character length except EndOfSentence and pad src_lengths = ((src_tokens.ne(self.eos) & src_tokens.ne(self.pad)).long().sum(dim=1)) elif "source" in net_input: src_tokens = net_input["source"] src_lengths = (net_input["padding_mask"].size(-1) - net_input["padding_mask"].sum(-1) if net_input["padding_mask"] is not None else torch.tensor(src_tokens.size(-1)).to(src_tokens)) else: raise Exception("expected src_tokens or source in net input") # bsz: total number of sentences in beam # Note that src_tokens may have more than 2 dimensions (i.e. audio features) bsz, src_len = src_tokens.size()[:2] beam_size = self.beam_size if constraints is not None and not self.search.supports_constraints: raise NotImplementedError( "Target-side constraints were provided, but search method doesn't support them" ) # Initialize constraints, when active self.search.init_constraints(constraints, beam_size) max_len: int = -1 if self.match_source_len: max_len = src_lengths.max().item() else: max_len = min( int(self.max_len_a * src_len + self.max_len_b), # exclude the EOS marker self.model.max_decoder_positions() - 1, ) assert ( self.min_len <= max_len ), "min_len cannot be larger than max_len, please adjust these!" # compute the encoder output for each beam encoder_outs = self.model.forward_encoder(net_input) # placeholder of indices for bsz * beam_size to hold tokens and accumulative scores new_order = torch.arange(bsz).view(-1, 1).repeat(1, beam_size).view(-1) new_order = new_order.to(src_tokens.device).long() encoder_outs = self.model.reorder_encoder_out(encoder_outs, new_order) # ensure encoder_outs is a List. assert encoder_outs is not None # initialize buffers scores = (torch.zeros(bsz * beam_size, max_len + 1).to(src_tokens).float() ) # +1 for eos; pad is never chosen for scoring tokens = (torch.zeros(bsz * beam_size, max_len + 2).to(src_tokens).long().fill_( self.pad)) # +2 for eos and pad tokens[:, 0] = self.eos if bos_token is None else bos_token attn: Optional[Tensor] = None # A list that indicates candidates that should be ignored. # For example, suppose we're sampling and have already finalized 2/5 # samples. Then cands_to_ignore would mark 2 positions as being ignored, # so that we only finalize the remaining 3 samples. 
cands_to_ignore = (torch.zeros(bsz, beam_size).to(src_tokens).eq(-1) ) # forward and backward-compatible False mask # list of completed sentences finalized = torch.jit.annotate( List[List[Dict[str, Tensor]]], [ torch.jit.annotate(List[Dict[str, Tensor]], []) for i in range(bsz) ], ) # contains lists of dictionaries of information about the hypothesis being finalized at each step finished = [ False for i in range(bsz) ] # a boolean array indicating if the sentence at the index is finished or not num_remaining_sent = bsz # number of sentences remaining # number of candidate hypos per step cand_size = 2 * beam_size # 2 x beam size in case half are EOS # offset arrays for converting between different indexing schemes bbsz_offsets = ((torch.arange(0, bsz) * beam_size).unsqueeze(1).type_as(tokens).to( src_tokens.device)) cand_offsets = torch.arange(0, cand_size).type_as(tokens).to( src_tokens.device) reorder_state: Optional[Tensor] = None batch_idxs: Optional[Tensor] = None original_batch_idxs: Optional[Tensor] = None if "id" in sample and isinstance(sample["id"], Tensor): original_batch_idxs = sample["id"] else: original_batch_idxs = torch.arange(0, bsz).type_as(tokens) for step in range(max_len + 1): # one extra step for EOS marker # reorder decoder internal states based on the prev choice of beams if reorder_state is not None: if batch_idxs is not None: # update beam indices to take into account removed sentences corr = batch_idxs - torch.arange( batch_idxs.numel()).type_as(batch_idxs) reorder_state.view(-1, beam_size).add_( corr.unsqueeze(-1) * beam_size) original_batch_idxs = original_batch_idxs[batch_idxs] self.model.reorder_incremental_state(incremental_states, reorder_state) encoder_outs = self.model.reorder_encoder_out( encoder_outs, reorder_state) lprobs, avg_attn_scores = self.model.forward_decoder( tokens[:, :step + 1], encoder_outs, incremental_states, self.temperature, ) if self.lm_model is not None: lm_out = self.lm_model(tokens[:, :step + 1]) probs = self.lm_model.get_normalized_probs(lm_out, log_probs=True, sample=None) probs = probs[:, -1, :] * self.lm_weight lprobs += probs lprobs[lprobs != lprobs] = torch.tensor(-math.inf).to(lprobs) lprobs[:, self.pad] = -math.inf # never select pad lprobs[:, self.unk] -= self.unk_penalty # apply unk penalty # handle max length constraint if step >= max_len: lprobs[:, :self.eos] = -math.inf lprobs[:, self.eos + 1:] = -math.inf # handle prefix tokens (possibly with different lengths) if (prefix_tokens is not None and step < prefix_tokens.size(1) and step < max_len): lprobs, tokens, scores = self._prefix_tokens( step, lprobs, scores, tokens, prefix_tokens, beam_size) elif step < self.min_len: # minimum length constraint (does not apply if using prefix_tokens) lprobs[:, self.eos] = -math.inf # Record attention scores; only supported when avg_attn_scores is a Tensor if avg_attn_scores is not None: if attn is None: attn = torch.empty(bsz * beam_size, avg_attn_scores.size(1), max_len + 2).to(scores) attn[:, :, step + 1].copy_(avg_attn_scores) scores = scores.type_as(lprobs) eos_bbsz_idx = torch.empty(0).to( tokens ) # indices of hypothesis ending with eos (finished sentences) eos_scores = torch.empty(0).to( scores ) # scores of hypothesis ending with eos (finished sentences) if self.should_set_src_lengths: self.search.set_src_lengths(src_lengths) if self.repeat_ngram_blocker is not None: lprobs = self.repeat_ngram_blocker(tokens, lprobs, bsz, beam_size, step) # Shape: (batch, cand_size) cand_scores, cand_indices, cand_beams = self.search.step( step,
lprobs.view(bsz, -1, self.vocab_size), scores.view(bsz, beam_size, -1)[:, :, :step], tokens[:, :step + 1], original_batch_idxs, ) # cand_bbsz_idx contains beam indices for the top candidate # hypotheses, with a range of values: [0, bsz*beam_size), # and dimensions: [bsz, cand_size] cand_bbsz_idx = cand_beams.add(bbsz_offsets) # finalize hypotheses that end in eos # Shape of eos_mask: (batch size, beam size) eos_mask = cand_indices.eq(self.eos) & cand_scores.ne(-math.inf) eos_mask[:, :beam_size][cands_to_ignore] = torch.tensor(0).to( eos_mask) # only consider eos when it's among the top beam_size indices # Now we know what beam item(s) to finish # Shape: 1d list of absolute-numbered eos_bbsz_idx = torch.masked_select(cand_bbsz_idx[:, :beam_size], mask=eos_mask[:, :beam_size]) finalized_sents: List[int] = [] if eos_bbsz_idx.numel() > 0: eos_scores = torch.masked_select(cand_scores[:, :beam_size], mask=eos_mask[:, :beam_size]) finalized_sents = self.finalize_hypos( step, eos_bbsz_idx, eos_scores, tokens, scores, finalized, finished, beam_size, attn, src_lengths, max_len, ) num_remaining_sent -= len(finalized_sents) assert num_remaining_sent >= 0 if num_remaining_sent == 0: break if self.search.stop_on_max_len and step >= max_len: break assert step < max_len, f"{step} < {max_len}" # Remove finalized sentences (ones for which {beam_size} # finished hypotheses have been generated) from the batch. if len(finalized_sents) > 0: new_bsz = bsz - len(finalized_sents) # construct batch_idxs which holds indices of batches to keep for the next pass batch_mask = torch.ones(bsz, dtype=torch.bool, device=cand_indices.device) batch_mask[finalized_sents] = False # TODO replace `nonzero(as_tuple=False)` after TorchScript supports it batch_idxs = torch.arange( bsz, device=cand_indices.device).masked_select(batch_mask) # Choose the subset of the hypothesized constraints that will continue self.search.prune_sentences(batch_idxs) eos_mask = eos_mask[batch_idxs] cand_beams = cand_beams[batch_idxs] bbsz_offsets.resize_(new_bsz, 1) cand_bbsz_idx = cand_beams.add(bbsz_offsets) cand_scores = cand_scores[batch_idxs] cand_indices = cand_indices[batch_idxs] if prefix_tokens is not None: prefix_tokens = prefix_tokens[batch_idxs] src_lengths = src_lengths[batch_idxs] cands_to_ignore = cands_to_ignore[batch_idxs] scores = scores.view(bsz, -1)[batch_idxs].view( new_bsz * beam_size, -1) tokens = tokens.view(bsz, -1)[batch_idxs].view( new_bsz * beam_size, -1) if attn is not None: attn = attn.view(bsz, -1)[batch_idxs].view( new_bsz * beam_size, attn.size(1), -1) bsz = new_bsz else: batch_idxs = None # Set active_mask so that values > cand_size indicate eos hypos # and values < cand_size indicate candidate active hypos. # After, the min values per row are the top candidate active hypos # Rewrite the operator since the element wise or is not supported in torchscript. eos_mask[:, :beam_size] = ~((~cands_to_ignore) & (~eos_mask[:, :beam_size])) active_mask = torch.add( eos_mask.type_as(cand_offsets) * cand_size, cand_offsets[:eos_mask.size(1)], ) # get the top beam_size active hypotheses, which are just # the hypos with the smallest values in active_mask. # {active_hypos} indicates which {beam_size} hypotheses # from the list of {2 * beam_size} candidates were # selected. Shapes: (batch size, beam size) new_cands_to_ignore, active_hypos = torch.topk(active_mask, k=beam_size, dim=1, largest=False) # update cands_to_ignore to ignore any finalized hypos.
cands_to_ignore = new_cands_to_ignore.ge(cand_size)[:, :beam_size] # Make sure there is at least one active item for each sentence in the batch. assert (~cands_to_ignore).any(dim=1).all() # update cands_to_ignore to ignore any finalized hypos # {active_bbsz_idx} denotes which beam number is continued for each new hypothesis (a beam # can be selected more than once). active_bbsz_idx = torch.gather(cand_bbsz_idx, dim=1, index=active_hypos) active_scores = torch.gather(cand_scores, dim=1, index=active_hypos) active_bbsz_idx = active_bbsz_idx.view(-1) active_scores = active_scores.view(-1) # copy tokens and scores for active hypotheses # Set the tokens for each beam (can select the same row more than once) tokens[:, :step + 1] = torch.index_select(tokens[:, :step + 1], dim=0, index=active_bbsz_idx) # Select the next token for each of them tokens.view(bsz, beam_size, -1)[:, :, step + 1] = torch.gather(cand_indices, dim=1, index=active_hypos) if step > 0: scores[:, :step] = torch.index_select(scores[:, :step], dim=0, index=active_bbsz_idx) scores.view(bsz, beam_size, -1)[:, :, step] = torch.gather(cand_scores, dim=1, index=active_hypos) # Update constraints based on which candidates were selected for the next beam self.search.update_constraints(active_hypos) # copy attention for active hypotheses if attn is not None: attn[:, :, :step + 2] = torch.index_select( attn[:, :, :step + 2], dim=0, index=active_bbsz_idx) # reorder incremental state in decoder reorder_state = active_bbsz_idx # sort by score descending for sent in range(len(finalized)): scores = torch.tensor( [float(elem["score"].item()) for elem in finalized[sent]]) _, sorted_scores_indices = torch.sort(scores, descending=True) finalized[sent] = [ finalized[sent][ssi] for ssi in sorted_scores_indices ] finalized[sent] = torch.jit.annotate(List[Dict[str, Tensor]], finalized[sent]) return finalized
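# --- Illustrative sketch (not part of the original code): how _generate() above pulls out the
# flat beam indices of hypotheses that just ended in EOS. Batch size, beam size and the masks
# are made up; cand_bbsz_idx addresses rows of the flattened (bsz * beam_size) token/score buffers.
import torch

bsz, beam_size = 2, 3
cand_size = 2 * beam_size

bbsz_offsets = (torch.arange(0, bsz) * beam_size).unsqueeze(1)   # [[0], [3]]
cand_beams = torch.randint(0, beam_size, (bsz, cand_size))
cand_bbsz_idx = cand_beams.add(bbsz_offsets)                     # absolute beam row per candidate

eos_mask = torch.zeros(bsz, cand_size, dtype=torch.bool)
eos_mask[0, 1] = True                                            # pretend one candidate hit EOS

# only the first beam_size candidates are allowed to finish a hypothesis
eos_bbsz_idx = torch.masked_select(cand_bbsz_idx[:, :beam_size],
                                   mask=eos_mask[:, :beam_size])
print(eos_bbsz_idx)   # flat indices into the (bsz * beam_size) buffers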
def forward(self, input, target, hidden, sent_lens, ce=False, noise=None): emb = self.emblayer(input) if (self.use_cell): if (self.use_rnn): hidden_output = torch.randn(emb.size(1), self.hidden_size).to(device) hidden_outputs = [] for x in emb: hidden_output = self.rnnlayer(x, hidden) hidden_outputs.append(hidden_output) hidden_outputs = pack_padded_sequence( torch.stack(hidden_outputs), sent_lens) else: hidden_output = torch.randn(emb.size(1), self.hidden_size).to(device) hidden_outputs = [] for x in emb: hidden_output, hidden = self.rnnlayer( x, (hidden_output, hidden)) hidden_outputs.append(hidden_output) hidden_outputs = pack_padded_sequence( torch.stack(hidden_outputs), sent_lens) elif (self.use_rnn_only): emb = pack_padded_sequence(emb, sent_lens) hidden_outputs, hidden = self.rnnlayer(emb, hidden) else: emb = pack_padded_sequence(emb, sent_lens) hidden_outputs, hidden = self.rnnlayer(emb, hidden) ''' CE training ''' if self.ce is True or ce is True: output = F.linear(hidden_outputs[0], self.weight, self.bias) # output = self.outlayer(hidden_outputs[0]) # ''' NCE training ''' elif self.nce is True: ''' target size: seq_len, minibatch noise size: seq_len, nsample indices size: seq_len, minibatch+nsample input size: seq_len, minibatch, nhidden ''' minibatch = target.size(-1) indices = torch.cat([target, noise], dim=-1) hidden_outputs = pad_packed_sequence(hidden_outputs)[0] hidden_outputs = hidden_outputs.contiguous() ''' weight size: seq_len, nhidden, minibatch+nsample bias size: seq_len, 1, minibatch+nsample ''' weight = self.weight.index_select(0, indices.view(-1)).view( *indices.size(), -1).transpose(1, 2) bias = self.bias.index_select( 0, indices.view(-1)).view_as(indices).unsqueeze(1) ''' out size: seq_len, minibatch, minibatch+nsample target_score size: seq_len, minibatch, minibatch noise_score size: seq_len, minibatch, nsample ''' out = torch.baddbmm(1, bias, 1, hidden_outputs, weight) target_score, noise_score = out[:, :, :minibatch], out[:, :, minibatch:] target_score = target_score.sub(self.lognormconst).exp() noise_score = noise_score.sub(self.lognormconst).exp() target_score = target_score.contiguous() noise_score = noise_score.contiguous() ''' target_score size: seq_len, minibatch target_noise_prob size: seq_len, minibatch noise_noise_prob size: seq_len, minibatch, nsample ''' index_slice = torch.arange( 0, target_score.size(1) * target_score.size(2), target_score.size(1)).long() for i, v in enumerate(index_slice): index_slice[i] = index_slice[i] + i target_score = target_score.view(target_score.size(0), -1).contiguous() target_score = target_score[:, index_slice] ## target_score = target_score.view(target_score.size(0), -1)[:, index_slice] target_noise_prob = self.noiseprob[target.view(-1)].view_as( target_score) noise_noise_prob = self.noiseprob[noise.view(-1)].view_as( noise).unsqueeze(1).expand_as(noise_score) model_loss = self.safe_log( target_score / (target_score + self.ncesample * target_noise_prob)) noise_loss = torch.sum( self.safe_log( (self.ncesample * noise_noise_prob) / (noise_score + self.ncesample * noise_noise_prob)), -1).squeeze() loss = -(model_loss + noise_loss) mask = input.gt(0.1) mask[0, :] = 1 loss = torch.masked_select(loss, mask) return loss.mean() else: print('need to be either ce or nce loss') exit() return output
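# --- Illustrative sketch (not part of the original code): the final masking step of the NCE
# branch above -- averaging the per-token loss only over real (non-padding) tokens. The pad id
# is assumed to be 0, and the first time step is always kept, mirroring mask[0, :] = 1 above.
import torch

seq_len, minibatch = 5, 3
token_ids = torch.tensor([[4, 7, 2],
                          [5, 3, 0],
                          [9, 0, 0],
                          [1, 0, 0],
                          [0, 0, 0]])          # columns are sentences, 0 = padding
per_token_loss = torch.rand(seq_len, minibatch)

mask = token_ids.gt(0)        # True where a real token sits
mask[0, :] = True             # keep the first step unconditionally
loss = torch.masked_select(per_token_loss, mask).mean()
print(loss.item())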