def unet_like_1d(x, usual_convolution):
    # u-net like steps for increasing / reducing dimensionality
    x = rearrange(x, 'b c t1 t2 -> b c (t1 t2)')  # reduce dimensionality
    y = rearrange(x, 'b c (t dt) -> b (dt c) t', dt=2)
    y = usual_convolution(y)
    x = x + rearrange(y, 'b (dt c) t -> b c (t dt)', dt=2)
    return x
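# A minimal usage sketch for unet_like_1d (an assumption, not part of the
# original code): PyTorch tensors with a Conv1d standing in for
# `usual_convolution`. Channels double (dt=2) before the conv, so the conv
# must map 2*c -> 2*c channels for the residual add to line up.
import torch
from torch import nn

_demo_conv = nn.Conv1d(32, 32, kernel_size=3, padding=1)  # 2 * c channels, c = 16
x = torch.randn(4, 16, 10, 10)                            # b c t1 t2
assert unet_like_1d(x, _demo_conv).shape == (4, 16, 100)  # b c (t1 t2)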
def forward(self, img: FloatTensor, img_mask: LongTensor) -> Tuple[FloatTensor, LongTensor]:
    """encode image to feature

    Parameters
    ----------
    img : FloatTensor
        [b, 1, h', w']
    img_mask : LongTensor
        [b, h', w']

    Returns
    -------
    Tuple[FloatTensor, LongTensor]
        [b, t, d], [b, t]
    """
    # extract feature
    feature, mask = self.model(img, img_mask)
    feature = self.feature_proj(feature)

    # proj
    feature = rearrange(feature, "b d h w -> b h w d")
    feature = self.norm(feature)

    # positional encoding
    feature = self.pos_enc_2d(feature, mask)

    # flatten to a 1-D sequence
    feature = rearrange(feature, "b h w d -> b (h w) d")
    mask = rearrange(mask, "b h w -> b (h w)")

    return feature, mask
def new_way(input, num_classes, num_anchors, anchors, stride_h, stride_w):
    raw_predictions = rearrange(input, 'b (anchor prediction) h w -> prediction b anchor h w',
                                anchor=num_anchors)

    anchors = torch.FloatTensor(anchors).to(input.device)
    anchor_sizes = rearrange(anchors, 'anchor dim -> dim () anchor () ()')

    _, _, _, in_h, in_w = raw_predictions.shape
    grid_h = rearrange(torch.arange(in_h).float(), 'h -> () () h ()').to(input.device)
    grid_w = rearrange(torch.arange(in_w).float(), 'w -> () () () w').to(input.device)

    predicted_bboxes = torch.zeros_like(raw_predictions)
    predicted_bboxes[0] = (raw_predictions[0].sigmoid() + grid_h) * stride_h  # center y
    predicted_bboxes[1] = (raw_predictions[1].sigmoid() + grid_w) * stride_w  # center x
    predicted_bboxes[2:4] = raw_predictions[2:4].exp() * anchor_sizes  # bbox width and height
    predicted_bboxes[4] = raw_predictions[4].sigmoid()  # confidence
    predicted_bboxes[5:] = raw_predictions[5:].sigmoid()  # class predictions
    # only to match results of original code, not needed
    return rearrange(predicted_bboxes, 'prediction b anchor h w -> b anchor h w prediction')
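# Unit-axis sketch: on the right-hand side of a pattern, '()' inserts a
# broadcastable length-1 axis, equivalent to chained unsqueeze calls; that is
# how grid_h, grid_w, and anchor_sizes above line up for broadcasting.
import torch
from einops import rearrange

grid = rearrange(torch.arange(4).float(), 'h -> () () h ()')
assert grid.shape == (1, 1, 4, 1)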
def test_rearrange_permutations_numpy():
    # tests random permutation of axes against two independent numpy ways
    for n_axes in range(1, 10):
        input = numpy.arange(2**n_axes).reshape([2] * n_axes)
        permutation = numpy.random.permutation(n_axes)
        left_expression = ' '.join('i' + str(axis) for axis in range(n_axes))
        right_expression = ' '.join('i' + str(axis) for axis in permutation)
        expression = left_expression + ' -> ' + right_expression
        result = rearrange(input, expression)
        for pick in numpy.random.randint(0, 2, [10, n_axes]):
            assert input[tuple(pick)] == result[tuple(pick[permutation])]

    for n_axes in range(1, 10):
        input = numpy.arange(2**n_axes).reshape([2] * n_axes)
        permutation = numpy.random.permutation(n_axes)
        left_expression = ' '.join('i' + str(axis) for axis in range(n_axes)[::-1])
        right_expression = ' '.join('i' + str(axis) for axis in permutation[::-1])
        expression = left_expression + ' -> ' + right_expression
        result = rearrange(input, expression)
        assert result.shape == input.shape
        expected_result = numpy.zeros_like(input)
        # every axis has length 2, so each element's value is a bitstring of its
        # indices; permuting axes permutes the bits
        for original_axis, result_axis in enumerate(permutation):
            expected_result |= ((input >> original_axis) & 1) << result_axis
        assert numpy.array_equal(result, expected_result)
def forward(self, feat_f0, feat_f1, feat_c0, feat_c1, data):
    W = self.W
    stride = data['hw0_f'][0] // data['hw0_c'][0]

    data.update({'W': W})
    if data['b_ids'].shape[0] == 0:
        feat0 = torch.empty(0, self.W**2, self.d_model_f, device=feat_f0.device)
        feat1 = torch.empty(0, self.W**2, self.d_model_f, device=feat_f0.device)
        return feat0, feat1

    # 1. unfold (crop) all local windows
    feat_f0_unfold = F.unfold(feat_f0, kernel_size=(W, W), stride=stride, padding=W // 2)
    feat_f0_unfold = rearrange(feat_f0_unfold, 'n (c ww) l -> n l ww c', ww=W**2)
    feat_f1_unfold = F.unfold(feat_f1, kernel_size=(W, W), stride=stride, padding=W // 2)
    feat_f1_unfold = rearrange(feat_f1_unfold, 'n (c ww) l -> n l ww c', ww=W**2)

    # 2. select only the predicted matches
    feat_f0_unfold = feat_f0_unfold[data['b_ids'], data['i_ids']]  # [n, ww, cf]
    feat_f1_unfold = feat_f1_unfold[data['b_ids'], data['j_ids']]

    # option: use coarse-level loftr feature as context: concat and linear
    if self.cat_c_feat:
        feat_c_win = self.down_proj(torch.cat([
            feat_c0[data['b_ids'], data['i_ids']],
            feat_c1[data['b_ids'], data['j_ids']],
        ], 0))  # [2n, c]
        feat_cf_win = self.merge_feat(torch.cat([
            torch.cat([feat_f0_unfold, feat_f1_unfold], 0),  # [2n, ww, cf]
            repeat(feat_c_win, 'n c -> n ww c', ww=W**2),    # [2n, ww, cf]
        ], -1))
        feat_f0_unfold, feat_f1_unfold = torch.chunk(feat_cf_win, 2, dim=0)

    return feat_f0_unfold, feat_f1_unfold
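# Window-unfold sketch (standalone, with made-up sizes): F.unfold flattens each
# WxW patch into a column of length c * W * W, channel-major, so the pattern
# '(c ww)' factors the column back apart into channels and window pixels.
import torch
import torch.nn.functional as F
from einops import rearrange

W = 5
feat = torch.randn(2, 16, 20, 20)
unfolded = F.unfold(feat, kernel_size=(W, W), stride=4, padding=W // 2)
unfolded = rearrange(unfolded, 'n (c ww) l -> n l ww c', ww=W**2)
assert unfolded.shape == (2, 25, 25, 16)  # (n, num_windows, W*W, c)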
def test6(x):
    # parsing parameters
    t = rearrange(x, 'b c h w -> (b h w) c')
    t = t[:, ::2]  # replacement for dot-product, just changes size of second axis
    assert t.shape == (10 * 30 * 40, 10)

    y = rearrange(t, '(b h w) c2 -> b c2 h w', **parse_shape(x, 'b _ h w'))
    assert y.shape == (10, 10, 30, 40)
    return y
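# parse_shape sketch: it returns a dict of axis lengths, and '_' skips an axis,
# which is what lets test6 recover b, h, w from the original 4-D tensor.
import numpy
from einops import parse_shape

assert parse_shape(numpy.zeros([10, 20, 30, 40]), 'b _ h w') == dict(b=10, h=30, w=40)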
def convolve_strided_2d(x, h_stride, w_stride, usual_convolution):
    x = rearrange(x, 'b c (h hs) (w ws) -> (hs ws b) c h w', hs=h_stride, ws=w_stride)
    x = usual_convolution(x)
    x = rearrange(x, '(hs ws b) c h w -> b c (h hs) (w ws)', hs=h_stride, ws=w_stride)
    return x
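# Hypothetical usage of convolve_strided_2d (the Conv2d below is a stand-in):
# every (hs, ws) phase of the image becomes its own batch element, is convolved
# once, then the phases are woven back together, so output shape matches input.
import torch
from torch import nn

_conv = nn.Conv2d(8, 8, kernel_size=3, padding=1)
x = torch.randn(2, 8, 12, 16)
assert convolve_strided_2d(x, 2, 2, _conv).shape == x.shape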
def test9(x):
    # squeeze - unsqueeze
    y = reduce(x, 'b c h w -> b c () ()', reduction='max')
    assert y.shape == (10, 20, 1, 1)
    y = rearrange(y, 'b c () () -> c b')
    assert y.shape == (20, 10)
    return y
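# Keep-dims sketch: '()' in a reduce pattern keeps length-1 axes, mirroring
# keepdims=True (or adaptive max-pooling to 1x1 in the 4-D case above).
import numpy
from einops import reduce

assert reduce(numpy.zeros([10, 20, 30, 40]), 'b c h w -> b c () ()', 'max').shape == (10, 20, 1, 1)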
def forward(self, x):
    b, c, h, w = x.shape
    cls_tokens = repeat(self.CLS, "1 1 d -> b 1 d", b=b)

    # Divide into flattened patches
    x = self.patch_and_flat(x)
    # Linear projection
    x = self.linear_proj(x)
    # Token dropout
    x = self.token_dropout(x)

    # Concatenate CLS if not using multihead attention pooling
    if not self.use_multihead_attention_pooling:
        x = torch.cat([cls_tokens, x], dim=1) + self.position_code

    # Transformer
    x = self.transformer(x)

    # Use multihead attention pooling if specified
    if self.use_multihead_attention_pooling:
        cls_tokens = self.map(cls_tokens, x)
        cls_tokens = rearrange(cls_tokens, "b 1 d -> b d")
    else:
        cls_tokens = x.select(dim=1, index=0)

    cls_tokens = self.proj(cls_tokens)
    return cls_tokens
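# CLS-broadcast sketch: einops.repeat copies a (1, 1, d) learned token across
# the batch, as in the `cls_tokens` line above (d=64 and b=8 are made up).
import torch
from einops import repeat

cls = torch.randn(1, 1, 64)
assert repeat(cls, '1 1 d -> b 1 d', b=8).shape == (8, 1, 64)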
def test_parse_shape_symbolic():
    backends = collect_test_backends(symbolic=True, layers=False)
    backends += collect_test_backends(symbolic=True, layers=True)
    for backend in backends:
        if backend.framework_name == 'keras':
            # need special way to compile, shape vars can be used only inside layers
            continue
        print('special shape parsing for', backend.framework_name)
        input_symbols = [
            backend.create_symbol([10, 20, 30, 40]),
            backend.create_symbol([10, 20, None, None]),
            backend.create_symbol([None, None, None, None]),
        ]
        if backend.framework_name in ['mxnet.symbol']:
            # mxnet can't normally run inference
            input_symbols = [backend.create_symbol([10, 20, 30, 40])]

        for input_symbol in input_symbols:
            shape_placeholder = parse_shape(input_symbol, 'a b c d')
            shape = {}
            for name, symbol in shape_placeholder.items():
                shape[name] = symbol if isinstance(symbol, int) \
                    else backend.eval_symbol(symbol, [(input_symbol, numpy.zeros([10, 20, 30, 40]))])
            print(shape)
            result_placeholder = rearrange(input_symbol, 'a b (c1 c2) (d1 d2) -> (a b d1) c1 (c2 d2)',
                                           **parse_shape(input_symbol, 'a b c1 _'), d2=2)
            result = backend.eval_symbol(result_placeholder,
                                         [(input_symbol, numpy.zeros([10, 20, 30, 40]))])
            print(result.shape)
            assert result.shape == (10 * 20 * 20, 30, 1 * 2)
            assert numpy.allclose(result, 0)
def forward(self, k, q, nbhd_idx):
    # (bs, m, c_in) -> (bs, m, embed_dim) -> (bs, m, n_heads, h_dim)
    K = rearrange(self.fc_k(k), "b n (h d) -> b n h d", h=self.n_heads)
    # (bs, n, c_in) -> (bs, n, embed_dim) -> (bs, n, n_heads, h_dim)
    Q = rearrange(self.fc_q(q), "b n (h d) -> b n h d", h=self.n_heads)

    # Key features are just the same for each point
    K = K.unsqueeze(2).repeat(1, 1, nbhd_idx.shape[2], 1, 1)
    # Batch indices
    B = (torch.arange(Q.shape[0], device=Q.device)
         .long()[:, None, None]
         .expand(*nbhd_idx.shape))
    # Extract the points for each nbhd
    Q = Q[B, nbhd_idx]
    # Concat and return
    return self.fc_o(torch.cat([K, Q], dim=-1))
def forward(self, data):
    """
    Update:
        data (dict): {
            'image0': (torch.Tensor): (N, 1, H, W)
            'image1': (torch.Tensor): (N, 1, H, W)
            'mask0' (optional): (torch.Tensor): (N, H, W) '0' indicates a padded position
            'mask1' (optional): (torch.Tensor): (N, H, W)
        }
    """
    # 1. Local Feature CNN
    data.update({
        'bs': data['image0'].size(0),
        'hw0_i': data['image0'].shape[2:],
        'hw1_i': data['image1'].shape[2:]
    })

    if data['hw0_i'] == data['hw1_i']:  # faster & better BN convergence
        feats_c, feats_f = self.backbone(torch.cat([data['image0'], data['image1']], dim=0))
        (feat_c0, feat_c1), (feat_f0, feat_f1) = feats_c.split(data['bs']), feats_f.split(data['bs'])
    else:  # handle different input shapes
        (feat_c0, feat_f0), (feat_c1, feat_f1) = self.backbone(data['image0']), self.backbone(data['image1'])

    data.update({
        'hw0_c': feat_c0.shape[2:], 'hw1_c': feat_c1.shape[2:],
        'hw0_f': feat_f0.shape[2:], 'hw1_f': feat_f1.shape[2:]
    })

    # 2. coarse-level loftr module
    # add featmap with positional encoding, then flatten it to sequence [N, HW, C]
    feat_c0 = rearrange(self.pos_encoding(feat_c0), 'n c h w -> n (h w) c')
    feat_c1 = rearrange(self.pos_encoding(feat_c1), 'n c h w -> n (h w) c')

    mask_c0 = mask_c1 = None  # mask is useful in training
    if 'mask0' in data:
        mask_c0, mask_c1 = data['mask0'].flatten(-2), data['mask1'].flatten(-2)
    feat_c0, feat_c1 = self.loftr_coarse(feat_c0, feat_c1, mask_c0, mask_c1)

    # 3. match coarse-level
    self.coarse_matching(feat_c0, feat_c1, data, mask_c0=mask_c0, mask_c1=mask_c1)

    # 4. fine-level refinement
    feat_f0_unfold, feat_f1_unfold = self.fine_preprocess(feat_f0, feat_f1, feat_c0, feat_c1, data)
    if feat_f0_unfold.size(0) != 0:  # at least one coarse-level match predicted
        feat_f0_unfold, feat_f1_unfold = self.loftr_fine(feat_f0_unfold, feat_f1_unfold)

    # 5. match fine-level
    self.fine_matching(feat_f0_unfold, feat_f1_unfold, data)
def test_collapsed_ellipsis_errors_out():
    x = numpy.zeros([1, 1, 1, 1, 1])
    rearrange(x, 'a b c d ... -> a b c ... d')
    with assert_raises(EinopsError):
        rearrange(x, 'a b c d (...) -> a b c ... d')

    rearrange(x, '... -> (...)')
    with assert_raises(EinopsError):
        rearrange(x, '(...) -> (...)')
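# Collapsed-ellipsis sketch: as the test above checks, '(...)' is accepted only
# on the right-hand side, where it flattens everything the ellipsis matched
# into a single axis.
import numpy
from einops import rearrange

assert rearrange(numpy.zeros([2, 3, 4]), '... -> (...)').shape == (24,)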
def test10(x):
    # stack
    tensors = list(x + 0)  # 0 is needed https://github.com/tensorflow/tensorflow/issues/23185
    tensors = rearrange(tensors, 'b c h w -> b h w c')
    assert tensors.shape == (10, 30, 40, 20)
    return tensors
def test11(x):
    # concatenate
    tensors = list(x + 0)  # 0 is needed https://github.com/tensorflow/tensorflow/issues/23185
    tensors = rearrange(tensors, 'b c h w -> h (b w) c')
    assert tensors.shape == (30, 10 * 40, 20)
    return tensors
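# List-as-axis sketch: handing rearrange a list of arrays stacks them along the
# first left-hand axis, so tests 10 and 11 above are stack and concatenate in
# disguise.
import numpy
from einops import rearrange

frames = [numpy.zeros([3, 4, 5]) for _ in range(2)]
assert rearrange(frames, 'b c h w -> b h w c').shape == (2, 4, 5, 3)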
def test_ellipsis_ops_numpy():
    x = numpy.arange(2 * 3 * 4 * 5 * 6).reshape([2, 3, 4, 5, 6])
    for pattern in identity_patterns:
        assert numpy.array_equal(x, rearrange(x, pattern)), pattern

    for pattern1, pattern2 in equivalent_rearrange_patterns:
        assert numpy.array_equal(rearrange(x, pattern1), rearrange(x, pattern2))

    for reduction in ['min', 'max', 'sum']:
        for pattern1, pattern2 in equivalent_reduction_patterns:
            assert numpy.array_equal(reduce(x, pattern1, reduction=reduction),
                                     reduce(x, pattern2, reduction=reduction))

    # now just check coincidence with numpy
    all_rearrange_patterns = [*identity_patterns]
    for pattern_pairs in equivalent_rearrange_patterns:
        all_rearrange_patterns.extend(pattern_pairs)
def ensemble_cross_rate_score(
    self,
    src_mask_list: List[Tuple[torch.Tensor, torch.Tensor]],
    hypotheses: List[Hypothesis],
    direction: str,
) -> None:
    """give hypotheses to another model, add score to hypotheses in place

    Args:
        src_mask_list: [([1, len, d_model], [1, len])]
        hypotheses (List[Hypothesis]):
        direction (str): one of {"l2r", "r2l"}
    """
    indices = [h.seq for h in hypotheses]
    tgt, output = to_tgt_output(indices, direction, self.device)
    b, length = tgt.size()

    prob_sum = torch.zeros((b, length, vocab_size), dtype=torch.float, device=self.device)
    for i, m in enumerate(self.models):
        src, src_mask = src_mask_list[i]
        exp_src = repeat(src.squeeze(0), "s e -> b s e", b=b)
        exp_src_mask = repeat(src_mask.squeeze(0), "s -> b s", b=b)
        output_hat = m.bttr.decoder(exp_src, exp_src_mask, tgt)
        prob_sum = prob_sum + torch.softmax(output_hat, dim=-1)
    log_p = torch.log(prob_sum / len(self.models))

    flat_hat = rearrange(log_p, "b l e -> (b l) e")
    flat = rearrange(output, "b l -> (b l)")
    loss = F.nll_loss(flat_hat, flat, ignore_index=vocab.PAD_IDX, reduction="none")

    loss = rearrange(loss, "(b l) -> b l", b=b)
    loss = torch.sum(loss, dim=-1)

    for i, seq_loss in enumerate(loss):
        hypotheses[i].score += -seq_loss
def get_fine_windows_prediction(self, desc_c1, desc_c2, desc_f1, desc_f2, matches, data):
    stride = data['fine_size_1'][0] // data['coarse_size_1'][0]

    if matches['b_ids'].shape[0] == 0:
        feat_f1_unfold = torch.empty(0, self.win_size**2, data["dim_f"], device=desc_c1.device)
        feat_f2_unfold = torch.empty(0, self.win_size**2, data["dim_f"], device=desc_c2.device)
    else:
        # 1. unfold (crop) all local windows
        feat_f1_unfold = functional.unfold(desc_f1, kernel_size=(self.win_size, self.win_size),
                                           stride=stride, padding=self.win_size // 2)
        feat_f2_unfold = functional.unfold(desc_f2, kernel_size=(self.win_size, self.win_size),
                                           stride=stride, padding=self.win_size // 2)
        feat_f1_unfold = rearrange(feat_f1_unfold, 'n (c ww) l -> n l ww c', ww=self.win_size**2)
        feat_f2_unfold = rearrange(feat_f2_unfold, 'n (c ww) l -> n l ww c', ww=self.win_size**2)

        # 2. select only the predicted matches
        feat_f1_unfold = feat_f1_unfold[matches['b_ids'], matches['i_ids']]  # [n, ww, cf]
        feat_f2_unfold = feat_f2_unfold[matches['b_ids'], matches['j_ids']]

    return feat_f1_unfold, feat_f2_unfold
def tensor_train_example_numpy():
    # kept here just for a collection, only tested for numpy
    # https://arxiv.org/pdf/1509.06569.pdf, (5)
    x = numpy.ones([3, 4, 5, 6])
    rank = 4
    if numpy.__version__ < '1.15.0':
        # numpy.einsum fails here, skip test
        return

    # creating appropriate Gs
    Gs = [numpy.ones([d, d, rank, rank]) for d in x.shape]
    Gs[0] = Gs[0][:, :, :1, :]
    Gs[-1] = Gs[-1][:, :, :, :1]

    # einsum way
    y = x.reshape((1,) + x.shape)
    for G in Gs:
        # taking partial results left-to-right
        # y = numpy.einsum('i j alpha beta, alpha i ... -> beta ... j', G, y)
        y = numpy.einsum('i j a b, a i ... -> b ... j', G, y)
    y1 = y.reshape(-1)

    # alternative way
    y = x.reshape(-1)
    for G in Gs:
        i, j, alpha, beta = G.shape
        y = rearrange(y, '(i rest alpha) -> rest (alpha i)', alpha=alpha, i=i)
        y = y @ rearrange(G, 'i j alpha beta -> (alpha i) (j beta)')
        y = rearrange(y, 'rest (beta j) -> (beta rest j)', beta=beta, j=j)
    y2 = y
    assert numpy.allclose(y1, y2)

    # yet another way
    y = x
    for G in Gs:
        i, j, alpha, beta = G.shape
        y = rearrange(y, 'i ... (j alpha) -> ... j (alpha i)', alpha=alpha, i=i)
        y = y @ rearrange(G, 'i j alpha beta -> (alpha i) (j beta)')
    y3 = y.reshape(-1)
    assert numpy.allclose(y1, y3)
def test_rearrange_consistency_numpy():
    shape = [1, 2, 3, 5, 7, 11]
    x = numpy.arange(numpy.prod(shape)).reshape(shape)
    for pattern in [
        'a b c d e f -> a b c d e f',
        'b a c d e f -> a b d e f c',
        'a b c d e f -> f e d c b a',
        'a b c d e f -> (f e) d (c b a)',
        'a b c d e f -> (f e d c b a)',
    ]:
        result = rearrange(x, pattern)
        assert len(numpy.setdiff1d(x, result)) == 0
        assert result.dtype == x.dtype

    result = rearrange(x, 'a b c d e f -> a (b) (c d e) f')
    assert numpy.array_equal(x.flatten(), result.flatten())

    result = rearrange(x, 'a aa aa1 a1a1 aaaa a11 -> a aa aa1 a1a1 aaaa a11')
    assert numpy.array_equal(x, result)

    result1 = rearrange(x, 'a b c d e f -> f e d c b a')
    result2 = rearrange(x, 'f e d c b a -> a b c d e f')
    assert numpy.array_equal(result1, result2)

    result = rearrange(rearrange(x, 'a b c d e f -> (f d) c (e b) a'),
                       '(f d) c (e b) a -> a b c d e f', b=2, d=5)
    assert numpy.array_equal(x, result)

    sizes = dict(zip('abcdef', shape))
    temp = rearrange(x, 'a b c d e f -> (f d) c (e b) a', **sizes)
    result = rearrange(temp, '(f d) c (e b) a -> a b c d e f', **sizes)
    assert numpy.array_equal(x, result)

    x2 = numpy.arange(2 * 3 * 4).reshape([2, 3, 4])
    result = rearrange(x2, 'a b c -> b c a')
    assert x2[1, 2, 3] == result[2, 3, 1]
    assert x2[0, 1, 2] == result[1, 2, 0]
def forward(self, k, q, nbhd_idx):
    """
    Parameters
    ----------
    k : torch.Tensor
        key features, shape (bs, m, c_in)
    q : torch.Tensor
        query features, shape (bs, n, c_in)

    Returns
    -------
    torch.Tensor
        shape (bs, n, m, h)
    """
    # (bs, m, c_in) -> (bs, m, embed_dim) -> (bs * n_heads, m, h_dim)
    K = rearrange(self.fc_k(k), "b n (h d) -> (b h) n d", h=self.n_heads)
    # (bs, n, c_in) -> (bs, n, embed_dim) -> (bs * n_heads, n, h_dim)
    Q = rearrange(self.fc_q(q), "b n (h d) -> (b h) n d", h=self.n_heads)

    # (bs * n_heads, n, h_dim), (bs * n_heads, m, h_dim) -> (bs * n_heads, n, m)
    A_ = Q.bmm(K.transpose(1, 2)) / math.sqrt(self.head_dim)
    # (bs * n_heads, n, m) -> (bs, n, m, n_heads)
    A_ = rearrange(A_, "(b h) n m -> b n m h", h=self.n_heads)

    # Batch indices
    B = (torch.arange(A_.shape[0], device=A_.device)
         .long()[:, None, None]
         .expand(*nbhd_idx.shape))
    # Get NNS indexes
    NNS = (torch.arange(A_.shape[1], device=A_.device)
           .long()[None, :, None]
           .expand(*nbhd_idx.shape))
    A_ = A_[B, NNS, nbhd_idx]
    return A_
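# Head-splitting sketch: '(h d)' on the left factors the embedding axis into
# heads x head_dim, and merging '(b h)' moves heads into the batch so a single
# bmm handles all heads at once (sizes below are made up).
import torch
from einops import rearrange

t = torch.randn(2, 5, 8)  # (bs, n, embed_dim), with 4 heads of dim 2
assert rearrange(t, 'b n (h d) -> (b h) n d', h=4).shape == (8, 5, 2)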
def train_single_epoch(epoch, model, train_loader, optimizer, criterion, device):
    """Train a single epoch"""
    for i, batch in enumerate(iter(train_loader)):
        img, target = batch
        img, target = img.to(device), target.to(device)
        optimizer.zero_grad()

        # append a trailing <PAD> so target[:, :-1] and target[:, 1:] stay aligned
        aux = torch.ones(target.shape[0], 1, dtype=int) * model.vocab.word_to_index['<PAD>']
        aux = aux.to(target.device)
        target = torch.cat([target, aux], dim=1)
        target_loss = target

        output = model(img, target[:, :-1])
        output = rearrange(output, 'bsz seq_len vocab_size -> bsz vocab_size seq_len')
        loss = criterion(output, target_loss[:, 1:])

        if i % 100 == 0:
            print('--------------------------------------------------------------------------------------------------')
            print(f'Epoch {epoch} batch: {i}/{len(train_loader)} loss: {loss.item()}')

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.25)
        optimizer.step()

        generated_captions = torch.argmax(output.transpose(1, 2), dim=-1)
        expected_captions = target[..., 1:]
        generated_captions = generated_captions[:16, ...]
        expected_captions = expected_captions[:16, ...]
        images = img[:16, ...]
        write_on_tensorboard(epoch=len(train_loader) * (epoch - 1) + i,
                             model=model,
                             loss=loss.item(),
                             images=images,
                             expected_captions=expected_captions,
                             generated_captions=generated_captions)
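# Why the rearrange before the loss (sketch): nn.CrossEntropyLoss expects the
# class dimension second, (bsz, vocab_size, seq_len), while the decoder emits
# (bsz, seq_len, vocab_size). Sizes below are made up.
import torch
from torch import nn
from einops import rearrange

logits = torch.randn(2, 7, 100)  # (bsz, seq_len, vocab_size)
targets = torch.randint(0, 100, (2, 7))
loss = nn.CrossEntropyLoss()(rearrange(logits, 'b s v -> b v s'), targets)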
def test_concatenations_and_stacking():
    for backend in imp_op_backends:
        print('testing shapes for ', backend.framework_name)
        for n_arrays in [1, 2, 5]:
            shapes = [[], [1], [1, 1], [2, 3, 5, 7], [1] * 6]
            for shape in shapes:
                if backend.framework_name == 'mxnet.ndarray' and len(shape) == 0:
                    # known bug of mxnet
                    continue
                arrays1 = [numpy.arange(i, i + numpy.prod(shape)).reshape(shape)
                           for i in range(n_arrays)]
                arrays2 = [backend.from_numpy(array) for array in arrays1]
                result0 = numpy.asarray(arrays1)
                result1 = rearrange(arrays1, '...->...')
                result2 = rearrange(arrays2, '...->...')
                assert numpy.array_equal(result0, result1)
                assert numpy.array_equal(result1, backend.to_numpy(result2))

                result1 = rearrange(arrays1, 'b ... -> ... b')
                result2 = rearrange(arrays2, 'b ... -> ... b')
                assert numpy.array_equal(result1, backend.to_numpy(result2))
def forward(self, pairwise_locations, mask, query_features, key_features, nbhd_idx):
    # (bs, m, c_in) -> (bs, m, embed_dim) -> (bs * n_heads, m, h_dim)
    K = rearrange(self.W_k(key_features), "b n (h d) -> (b h) n d", h=self.n_heads)
    # (bs, n, c_in) -> (bs, n, embed_dim) -> (bs * n_heads, n, h_dim)
    Q = rearrange(self.W_q(query_features), "b n (h d) -> (b h) n d", h=self.n_heads)
    e = rearrange(self.W_l(pairwise_locations), "b n m (h d) -> (b h) n m d", h=self.n_heads)

    u = self.u.repeat([mask.shape[0], 1, 1])
    v = self.v.repeat([mask.shape[0], 1, 1])
    nbhd_idx = nbhd_idx.repeat_interleave(self.n_heads, dim=0)

    # Get NNS indexes
    NNS = (torch.arange(nbhd_idx.shape[1], device=nbhd_idx.device)[None, :, None]
           .long()
           .expand(*nbhd_idx.shape))
    # Batch indices
    B = (torch.arange(nbhd_idx.shape[0], device=nbhd_idx.device)[:, None, None]
         .long()
         .expand(*nbhd_idx.shape))

    A_ = (Q.bmm(K.transpose(1, 2))[B, NNS, nbhd_idx]
          + self.lamda * (e @ (Q + v).unsqueeze(-1)).squeeze()
          + (u @ K.transpose(1, 2))[B, 0, nbhd_idx]) / math.sqrt(self.head_dim)

    return rearrange(A_, "(b h) n m -> b n m h", h=self.n_heads)
def forward(self, img: FloatTensor, img_mask: LongTensor) -> Tuple[FloatTensor, LongTensor]:
    """encode image to feature

    Parameters
    ----------
    img : FloatTensor
        [b, 1, h', w']
    img_mask : LongTensor
        [b, h', w']

    Returns
    -------
    Tuple[FloatTensor, LongTensor]
        [b, t, d], [b, t]
    """
    feature, mask = self.model(img, img_mask)
    feature = self.feature_proj(feature)
    feature = self.pos_enc_2d(feature, mask)

    feature = rearrange(feature, "b d h w -> b (h w) d")
    mask = rearrange(mask, "b h w -> b (h w)")

    return feature, mask
def evaluate_tr(model, test_loader, device, epoch, criterion):
    model.eval()
    total_loss = 0.
    with torch.no_grad():
        for idx, batch in enumerate(iter(test_loader)):
            img, target = batch
            img = img.to(device)
            target = target.to(device)

            # append a trailing <PAD> so target[:, :-1] and target[:, 1:] stay aligned
            aux = torch.ones(target.shape[0], 1, dtype=int) * model.vocab.word_to_index['<PAD>']
            aux = aux.to(target.device)
            target = torch.cat([target, aux], dim=1)
            target_loss = target

            output = model(img, target[:, :-1])
            output = rearrange(output, 'bsz seq_len vocab_size -> bsz vocab_size seq_len')
            loss = criterion(output, target_loss[:, 1:])
            total_loss += loss.item()

            if idx % 10 == 0:
                num_img = random.randint(0, img.shape[0] - 1)
                sentence = model.generate(image=img[num_img].unsqueeze(0))
                reference = model.vocab.generate_caption(target[num_img, 1:])
                print(f'Evaluating batch {idx} / {len(test_loader)}...')
                print(f'Gen example (no teacher_forcing): {sentence}')
                print(f'Exp example: {reference}')

            generated_captions = torch.argmax(output.transpose(1, 2), dim=-1)
            expected_captions = target[..., 1:]
            generated_captions = generated_captions[:16, ...]
            expected_captions = expected_captions[:16, ...]
            write_on_tensorboard_evaluate(model=model,
                                          epoch=len(test_loader) * (epoch - 1) + idx,
                                          loss=loss.item(),
                                          expected_captions=expected_captions,
                                          generated_captions=generated_captions)
    return total_loss
def test_parse_shape_symbolic(self, shape):
    print('special shape parsing for', self.backend.framework_name)
    if self.backend.framework_name in ['mxnet.symbol']:
        # mxnet can't normally run inference
        shape = [10, 20, 30, 40]
    input_symbol = self.backend.create_symbol(shape)

    shape_placeholder = parse_shape(input_symbol, 'a b c d')
    shape = {}
    for name, symbol in shape_placeholder.items():
        shape[name] = symbol if isinstance(symbol, int) \
            else self.backend.eval_symbol(symbol, [(input_symbol, numpy.zeros([10, 20, 30, 40]))])
    print(shape)

    result_placeholder = rearrange(input_symbol, 'a b (c1 c2) (d1 d2) -> (a b d1) c1 (c2 d2)',
                                   **parse_shape(input_symbol, 'a b c1 _'), d2=2)
    result = self.backend.eval_symbol(result_placeholder,
                                      [(input_symbol, numpy.zeros([10, 20, 30, 40]))])
    print(result.shape)
    assert result.shape == (10 * 20 * 20, 30, 1 * 2)
    assert numpy.allclose(result, 0)
def train_single_batch(epoch, model, batch, optimizer, criterion, device):
    img, target = batch
    img, target = img.to(device), target.to(device)
    optimizer.zero_grad()
    target[target == 2] = 0

    output = model(img, target[:, :-1])
    output = rearrange(output, 'bsz seq_len vocab_size -> bsz vocab_size seq_len')
    loss = criterion(output, target[..., 1:])

    print('--------------------------------------------------------------------------------------------------')
    print(f'Loss: {loss.item()}')

    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.25, error_if_nonfinite=True)
    optimizer.step()

    if not epoch % 2:
        generated_captions = torch.argmax(output.transpose(1, 2), dim=-1)
        expected_captions = target[..., 1:]
        generated_captions = generated_captions[:16, ...]
        expected_captions = expected_captions[:16, ...]
        images = img[:16, ...]
        write_on_tensorboard(epoch=epoch,
                             model=model,
                             loss=loss.item(),
                             images=images,
                             expected_captions=expected_captions,
                             generated_captions=generated_captions)
def operation(x):
    if reduction == 'rearrange':
        return rearrange(x, pattern, **axes_lengths)
    else:
        return reduce(x, pattern, reduction, **axes_lengths)
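# Hypothetical call site for `operation` (an assumption: in the original tests
# it closes over `reduction`, `pattern`, and `axes_lengths` from the enclosing
# scope, so those names must be bound before calling it):
import numpy
reduction, pattern, axes_lengths = 'rearrange', 'b c h w -> b (c h w)', {}
assert operation(numpy.zeros([2, 3, 4, 5])).shape == (2, 60)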
def forward(self, head_coarse, head_fine, x1, x2, kpts, scores, img_size_1, img_size_2,
            valid_size_1, valid_size_2, intrinsics1, intrinsics2, extrinsics1, extrinsics2):
    # select fpn levels
    x1_f, x1_c = self._get_level(x1)
    x2_f, x2_c = self._get_level(x2)

    data = {
        'B': x1[0].size(0),
        'img_size_1': img_size_1,
        'img_size_2': img_size_2,
        'coarse_size_1': x1_c.shape[2:],
        'coarse_size_2': x2_c.shape[2:],
        'fine_size_1': x1_f.shape[2:],
        'fine_size_2': x2_f.shape[2:],
        'dim_c': head_coarse.embedding_size,
        'dim_f': head_fine.embedding_size,
        'win_size': self.win_size
    }

    try:
        # 1. sparse annotation generation
        if kpts.all_none:
            raise Empty

        with torch.no_grad():
            # Compute F and P matrices
            P, E, F = self.get_transformation(intrinsics1, intrinsics2, extrinsics1, extrinsics2)
            # Keypoint Generator
            kpts = self.generator(kpts, scores, valid_size_2, intrinsics1, intrinsics2, F, P)

        if kpts.all_none:
            raise Empty

        # Get contiguous view and valid transformations
        kpts, kpts_idx = kpts.contiguous

        # 2. coarse-level module
        # add featmap with positional encoding, then flatten it to sequence [N, HW, C]
        desc_c1 = rearrange(self.pos_encoding(x1_c), 'n c h w -> n (h w) c')
        desc_c2 = rearrange(self.pos_encoding(x2_c), 'n c h w -> n (h w) c')

        mask_c0 = mask_c1 = None  # mask is useful in training
        # if 'mask0' in data:
        #     mask_c0, mask_c1 = data['mask0'].flatten(-2), data['mask1'].flatten(-2)

        # Run coarse head
        set_active_group(head_coarse, active_group(True))
        desc_c1, desc_c2 = head_coarse(desc_c1, desc_c2, mask_c0, mask_c1)

        # 3. match coarse-level
        matches = self.coarse_matching(desc_c1, desc_c2, data, mask_c0=mask_c0, mask_c1=mask_c1)

        # 4. coarse-to-fine refinement
        desc_f1, desc_f2 = self.get_fine_windows_prediction(desc_c1, desc_c2, x1_f, x2_f,
                                                            matches, data)

        # 5. fine-level module
        if desc_f1.size(0) != 0:  # at least one coarse-level match predicted
            # Run fine head
            set_active_group(head_fine, active_group(True))
            desc_f1, desc_f2 = head_fine(desc_f1, desc_f2)

        # 6. match fine-level
        matches = self.fine_matching(desc_f1, desc_f2, matches, data)

        # Compute loss
        F, _ = F.contiguous
        P, _ = P.contiguous
        epipolar_loss = self.loss(kpts, F[kpts_idx], P[kpts_idx], matches, data)
    except Empty:
        active_group(False)
        epipolar_loss = sum(x_i.sum() for x_i in x1) * 0

    return epipolar_loss