def forward(self, hidden_states, attention_mask):
    """Run multi-head self-attention over ``hidden_states``.

    Args:
        hidden_states: input passed to the query/key/value projections.
        attention_mask: additive mask that is summed onto the raw attention
            scores (precomputed for all layers in BertModel's forward()).

    Returns:
        The attended values with the two trailing axes merged into a single
        axis of size ``self.all_head_size``.
    """
    # Project the input three times, then let ``transpose_for_scores``
    # rearrange every projection for per-head attention.
    projections = [
        project(hidden_states)
        for project in (self.query, self.key, self.value)
    ]
    q_heads, k_heads, v_heads = [
        self.transpose_for_scores(layer) for layer in projections
    ]

    # Raw attention scores: dot product between "query" and "key", scaled
    # by the square root of the per-head size.
    scale = math.sqrt(self.attention_head_size)
    scores = F.matmul(q_heads, transpose(k_heads, -1, -2)) / scale

    # Apply the (additive) attention mask.
    scores = scores + attention_mask

    # Normalize the attention scores to probabilities along the last axis.
    last_axis = len(scores.shape) - 1
    probs = Softmax(last_axis)(scores)

    # This is actually dropping out entire tokens to attend to, which might
    # seem a bit unusual, but is taken from the original Transformer paper.
    probs = self.dropout(probs)

    # Weighted sum of the values, then swap axes 1 and 2.
    context = F.matmul(probs, v_heads).transpose(0, 2, 1, 3)

    # Build the target shape from a tensor (symbolic shape) rather than
    # Python ints to make trace happy.
    leading_dims = mge.tensor(context.shape)[:-2]
    merged_shape = F.concat([leading_dims, self.all_head_size])
    return context.reshape(merged_shape)
def test_basic():
    """Check GradManager gradients of ``y = x @ w + b`` w.r.t. ``w`` and ``b``.

    The same computation is recorded twice: once with explicit
    ``record()``/``release()`` calls and once with the manager used as a
    context manager. Both must yield identical gradients.
    """
    inp = mge.tensor([1.0, 3.0, 5.0]).reshape(1, 3)
    weight = mge.tensor([2.0, 4.0, 6.0]).reshape(3, 1)
    bias = mge.tensor(-1.0)

    def check_grads():
        # d(x @ w + b)/dw is x transposed, d/db is 1.
        np.testing.assert_equal(weight.grad.numpy(), [[1], [3], [5]])
        np.testing.assert_equal(bias.grad.numpy(), [1])

    manager = GradManager().attach([weight, bias])

    # Variant 1: explicit recording.
    manager.record()
    out = F.matmul(inp, weight) + bias
    manager.backward(out)
    manager.release()  # is not necessary
    check_grads()

    # Clear the gradients so the second variant starts from scratch.
    weight.grad = None
    bias.grad = None

    # Variant 2: recording scoped by the ``with`` block.
    with manager:
        out = F.matmul(inp, weight) + bias
        manager.backward(out)
    check_grads()
def test_level1_infer_shape_with_unknown():
    """Ops on a tensor whose shape is unknown must raise at async level 1.

    ``d`` is created at async level 2 by reshaping ``a`` to a target shape
    that is itself the result of a matmul (``c``). After switching to level 1,
    both a reshape and a matmul that consume ``d`` are expected to raise
    ``RuntimeError``.

    The async level is process-global state, so it is restored to 2 in a
    ``finally`` block; otherwise a failure in this test would leave level 1
    active for every test that runs afterwards.
    """
    config_async_level(2)
    try:
        a = mge.tensor([[1, 2, 2, 3]], dtype="float32")
        b = mge.tensor([1, 1])
        multi2 = mge.tensor(np.array([[2, 0], [0, 2]]), dtype="float32")
        c = F.matmul(b, multi2)
        # make DepType::SHAPE unknown
        d = F.reshape(a, c)
        e = mge.tensor([[1, 2]], dtype="float32")
        config_async_level(1)
        # test src no shape, throw in level1
        with pytest.raises(RuntimeError):
            F.reshape(d, b)
        with pytest.raises(RuntimeError):
            F.matmul(d, e)
    finally:
        config_async_level(2)
def test_level1_infer_value():
    """Reshape with a target shape of unknown value must raise at level 1.

    The target shape ``c`` is produced by a matmul, and reshaping ``a`` with
    it under async level 1 is expected to raise ``RuntimeError``.

    The async level is process-global state, so it is restored to 2 in a
    ``finally`` block even when the test body fails.
    """
    config_async_level(1)
    try:
        a = mge.tensor([[1, 2], [2, 3], [3, 4]], dtype="float32")
        b = mge.tensor([1, 1], dtype="float32")
        identity = mge.tensor(np.array([[1, 0], [0, 1]]), dtype="float32")
        # make DepType::VALUE unknown
        c = F.matmul(b, identity)
        with pytest.raises(RuntimeError):
            F.reshape(a, c)
    finally:
        config_async_level(2)
def forward(self, x):
    """Apply a randomly generated block-shaped drop mask to ``x``.

    Outside training, or when ``self.drop_prob`` is not positive, ``x`` is
    returned untouched. Otherwise seed positions are sampled with probability
    ``gamma``, grown into ``kernel_size`` x ``kernel_size`` blocks with a
    stride-1 max pool, inverted into a keep-mask and applied to ``x``.

    Args:
        x: 4-D input; the last three axes are read as ``c, h, w``.

    Returns:
        ``x`` itself when inactive, otherwise the masked and rescaled result.
    """
    if not self.training or self.drop_prob <= 0.0:
        return x

    _, c, h, w = x.shape
    pad = max(self.kernel_size - 1, 0) // 2
    numel = c * h * w
    # Seed probability, scaled so that the grown blocks cover roughly
    # ``drop_prob`` of the feature map.
    gamma = self.drop_prob * (w * h) / (self.kernel_size**2) / (
        (w - self.kernel_size + 1) * (h - self.kernel_size + 1))

    # Threshold the noise exactly once. The previous two-step in-place
    # update (`mask[mask < gamma] = 1` followed by `mask[mask >= gamma] = 0`)
    # reset the freshly written ones as well, because 1 >= gamma, leaving an
    # all-zero seed mask so that nothing was ever dropped.
    noise = mge.random.uniform(0, 1, size=(1, c, h, w))
    mask = (noise < gamma).astype("float32")

    # Grow every seed into a block, then invert: 1 = keep, 0 = drop.
    mask = F.max_pool2d(mask, [self.kernel_size, self.kernel_size],
                        stride=1,
                        padding=(pad, pad))
    mask = 1 - mask

    x1 = F.expand_dims(1.0 * numel / mask.sum(axis=0), axis=0)
    # NOTE(review): the mask and the rescale factor are applied with matrix
    # products rather than element-wise multiplication; kept as in the
    # original - confirm this is intended.
    y = F.matmul(F.matmul(x, mask), x1)
    return y
def forward(self, inps):
    """Evaluate ``alpha * matmul(inps[0], inps[1]) [+ beta * inps[2]]``.

    ``self.param`` supplies ``transA``/``transB`` (forwarded positionally to
    ``F.matmul``) and the scalars ``alpha``/``beta``. Multiplications by a
    factor equal to 1.0 are skipped. The third input is only added when
    exactly three inputs are given.
    """
    cfg = self.param
    lhs, rhs = inps[0], inps[1]

    product = F.matmul(lhs, rhs, cfg["transA"], cfg["transB"])
    if cfg["alpha"] != 1.0:
        product = F.mul(product, cfg["alpha"])

    if len(inps) != 3:
        return product

    addend = inps[2]
    if cfg["beta"] != 1.0:
        addend = F.mul(addend, cfg["beta"])
    return F.add(product, addend)
def forward(self, x, bridge):
    """Upsample ``x``, merge it with the skip connection and convolve.

    When ``self.subnet`` is truthy, the skip features are first projected
    onto the subspace spanned by ``self.num_subspace`` basis vectors that the
    subnet predicts from the concatenated features.

    Args:
        x: features fed to ``self.up``.
        bridge: skip-connection features fed to ``self.skip_m``; the result
            is unpacked as a 4-D shape.

    Returns:
        Output of ``self.conv_block`` on the concatenated features.
    """
    up = self.up(x)
    bridge = self.skip_m(bridge)
    out = F.concat([up, bridge], 1)
    if not self.subnet:
        return self.conv_block(out)

    batch, channels, height, width = bridge.shape
    pixels = height * width

    # Basis vectors predicted by the subnet, one row per subspace direction,
    # each normalised by its L1 norm (1e-6 guards against division by zero).
    basis_t = self.subnet(out).reshape(batch, self.num_subspace, pixels)
    basis_t = basis_t / (1e-6 + F.abs(basis_t).sum(axis=2, keepdims=True))
    basis = basis_t.transpose(0, 2, 1)

    # Least-squares projector: (V^T V)^-1 V^T.
    gram_inv = F.matinv(F.matmul(basis_t, basis))
    projector = F.matmul(gram_inv, basis_t)

    # Project the flattened skip features and map them back to 4-D.
    flat_bridge = bridge.reshape(batch, channels, pixels)
    coeffs = F.matmul(projector, flat_bridge.transpose(0, 2, 1))
    projected = F.matmul(basis, coeffs).transpose(0, 2, 1)
    bridge = projected.reshape(batch, channels, height, width)

    return self.conv_block(F.concat([up, bridge], 1))
def test_level1_infer_shape_with_unknown():
    """A matmul on a tensor of unknown shape must raise at async level 1.

    ``d`` is created at async level 2 by reshaping ``a`` to a target shape
    that is itself a computed tensor (``b * 2``). After switching to level 1,
    a matmul that consumes ``d`` is expected to raise ``RuntimeError``.

    The async level is process-global state. The test used to leave it at 1;
    it is now reset to 2 in a ``finally`` block so later tests are not
    affected, whether or not this one passes.
    """
    config_async_level(2)
    try:
        a = mge.tensor([[1, 2, 2, 3]], dtype="float32")
        b = mge.tensor([1, 1])
        c = b * 2
        # make DepType::SHAPE unknown
        d = F.reshape(a, c)
        config_async_level(1)
        e = mge.tensor([[1, 2]], dtype="float32")
        with pytest.raises(RuntimeError):
            F.matmul(d, e)
    finally:
        config_async_level(2)
def get_flow_mge(H_mat_mul, patch_indices, image_size_h=600, image_size_w=800):
    """Turn per-strip 3x3 matrices into a dense 2-channel flow field.

    The image is cut into ``divide`` horizontal strips (``divide`` is axis 1
    of ``H_mat_mul``). Every pixel of a strip is assigned that strip's 3x3
    matrix, the pixel coordinates are multiplied by it, divided by the third
    component, and the original coordinates are subtracted.

    Args:
        H_mat_mul: matrices, reshaped here to ``(batch, divide, 3, 3)``
            (the original comment gives ``(N, 6, 3, 3)`` as an example).
        patch_indices: coordinate grid; presumably
            ``(batch, 3, image_size_h, image_size_w)`` homogeneous
            coordinates - TODO confirm against the caller.
        image_size_h: image height in pixels.
        image_size_w: image width in pixels.

    Returns:
        ``warp_index - patch_indices[:, :2, ...]``, the 2-channel flow.
    """
    batch_size = H_mat_mul.shape[0]
    divide = H_mat_mul.shape[1]
    H_mat_mul = mge.Tensor(H_mat_mul.reshape(batch_size, divide, 3, 3))

    strip_h = image_size_h // divide
    H_mat_pool = F.zeros((batch_size, image_size_h, image_size_w, 3, 3))
    for i in range(divide):
        row_start = i * strip_h
        # The last strip also takes the rows left over when the height is
        # not divisible by ``divide``.
        row_end = image_size_h if i == divide - 1 else row_start + strip_h
        H_mat = F.expand_dims(F.expand_dims(H_mat_mul[:, i, :, :], 1), 1)
        H_mat_pool[:, row_start:row_end, ...] = F.broadcast_to(
            H_mat, (batch_size, row_end - row_start, image_size_w, 3, 3))

    # Per-pixel matrix-vector product: (..., 3, 3) @ (..., 3, 1).
    pred_I2_index_warp = F.expand_dims(patch_indices.transpose(0, 2, 3, 1), 4)
    pred_I2_index_warp = F.matmul(
        H_mat_pool, pred_I2_index_warp)[:, :, :, :, 0].transpose(0, 3, 1, 2)

    # Divide by the third component; the small offset avoids division by 0.
    smallers = 1e-6
    T_t = pred_I2_index_warp[:, 2:3, ...] + smallers
    v1 = pred_I2_index_warp[:, 0:1, ...] / T_t
    v2 = pred_I2_index_warp[:, 1:2, ...] / T_t
    warp_index = F.concat((v1, v2), 1)

    vgrid = patch_indices[:, :2, ...]
    return warp_index - vgrid
def forward(self, data, quad):
    """Warp ``data`` with the perspective transform defined by ``quad``.

    data: (1, 3, 48, 160)
    quad: (1, 4, 2)

    An 8x8 linear system is assembled per sample from ``quad`` and the
    target points ``self.bb_out``, solved by matrix inversion, and the eight
    solution values plus one element taken from ``self.I`` are reshaped into
    the 3x3 matrix passed to ``F.warp_perspective``.

    Returns:
        ``{"data": warped}`` where ``warped`` is the (48, 160) warp result.
    """
    batch = quad.shape[0]
    dst = F.repeat(self.bb_out, batch, axis=0).reshape(-1, 4, 2)
    expanded_i = F.broadcast_to(self.I, quad.shape)
    i_col = expanded_i[:, :, 0:1]
    dst_x = dst[:, :, 0:1]
    dst_y = dst[:, :, 1:2]

    # Fill the coefficient matrix block by block: rows 0-3 belong to the
    # first target coordinate, rows 4-7 to the second.
    system = F.broadcast_to(self.A, (batch, 8, 8))
    system[:, 0:4, 0:2] = quad
    system[:, 4:8, 5:6] = i_col
    system[:, 0:4, 6:8] = -quad * dst_x
    system[:, 4:8, 3:5] = quad
    system[:, 0:4, 2:3] = i_col
    system[:, 4:8, 6:8] = -quad * dst_y

    # Right-hand side: the target coordinates stacked into a column.
    rhs = dst.transpose(0, 2, 1).reshape(-1, 8, 1)

    # Solve, append the ninth entry and form the 3x3 matrix.
    solution = F.matmul(F.matinv(system), rhs)[:, :, 0]
    matrix = F.concat([solution, expanded_i[:, 0:1, 0]],
                      axis=1).reshape(-1, 3, 3)

    warped = F.warp_perspective(data, matrix, (48, 160))  # (N, 3, 48, 160)
    return {"data": warped}
def calculate_score(configs, facescrub, labels, megaface):
    """Calculate the MegaFace identification top-1 score.

    This evaluation strictly follows the description of
    `"The MegaFace Benchmark: 1 Million Faces for Recognition at Scale"
    <https://arxiv.org/pdf/1512.00596.pdf>`_

    This implementation outputs exactly the same as the dev-sdk provided by
    the official benchmark, but with much higher speed.

    For every probe ``i`` and every other probe ``j`` with the same label,
    the pair counts as a hit when the squared L2 distance between ``i`` and
    ``j`` is smaller than the smallest squared L2 distance between ``i`` and
    any megaface (distractor) feature.

    Args:
        configs (dict): configuration (not used by this function)
        facescrub (np.array): feature of facescrub
        labels (np.array): label of facescrub
        megaface (np.array): feature of megaface

    Returns:
        megaface_score (float): top1 score of megaface, in percent

    Raises:
        ZeroDivisionError: if no probe has another probe with the same label.
    """
    facescrub = mge.tensor(facescrub, dtype="float32")
    megaface = mge.tensor(megaface, dtype="float32")

    # note: (x - y) ** 2 = x ** 2 + y ** 2 - 2 * x * y
    # facescrub_score[i][j] = l2-dist(facescrub[i], facescrub[j])
    facescrub_sq = (facescrub ** 2).sum(axis=-1, keepdims=True)
    facescrub_score = (
        facescrub_sq
        + facescrub_sq.transpose(1, 0)
        - 2 * F.matmul(facescrub, facescrub.transpose(1, 0))
    ).numpy()

    # The squared norms of the distractors do not depend on the probe, so
    # compute them once instead of once per probe.
    megaface_sq = (megaface ** 2).sum(axis=-1)

    def get_score_min_megaface(x):
        distr_score = (
            (x ** 2).sum(axis=-1)
            + megaface_sq
            - 2 * (x * megaface).sum(axis=-1)
        )
        return distr_score.min()

    num_probes = len(facescrub)
    probe_ids = np.arange(num_probes)
    up, down = 0, 0
    for probe_i in tqdm(range(num_probes)):
        distr_score_min = get_score_min_megaface(facescrub[probe_i]).numpy()
        # Same identity, excluding the probe itself.
        mask = (labels == labels[probe_i]) & (probe_ids != probe_i)
        hits = facescrub_score[probe_i][mask] < distr_score_min
        up += int(np.count_nonzero(hits))
        down += int(np.count_nonzero(mask))

    megaface_score = up / down * 100
    return megaface_score
def test_dy():
    """Check that ``GradManager.backward`` scales gradients by ``dy``.

    For ``y = x @ w + b`` with an upstream gradient ``dy`` of 2.5, the
    gradients of ``w`` and ``b`` must be the unscaled gradients multiplied by
    ``dy``.

    The nested helper ``get_grad`` that used to be defined here was never
    called and has been removed.
    """
    x = mge.tensor([1.0, 3.0, 5.0]).reshape(1, 3)
    w = mge.tensor([2.0, 4.0, 6.0]).reshape(3, 1)
    b = mge.tensor(-1.0)

    gm = GradManager().attach([w, b])

    # dy's shape should be the same as y's
    dy = mge.tensor(2.5).reshape(1, 1)

    w.grad = None
    b.grad = None
    with gm:
        p = F.matmul(x, w)
        y = p + b
        gm.backward(y, dy=dy)

    # list * ndarray broadcasts, giving the expected scaled gradients.
    np.testing.assert_equal(w.grad.numpy(), [[1], [3], [5]] * dy.numpy())
    np.testing.assert_equal(b.grad.numpy(), [1] * dy.numpy())
# Arithmetic on A and B with Python operators.
# NOTE(review): A and B are defined before this fragment - not visible here.
print(A + B)
print(A - B)
print(A * B)
print(A / B)

# The same four operations through the functional API.
print(F.add(A, B))
print(F.sub(A, B))
print(F.mul(A, B))
print(F.div(A, B))

# Indexing: row 1, first two columns of a 2x3 tensor.
A = mge.tensor([[1., 2., 3.], [4., 5., 6.]])
print(A[1, :2])

# Reshaping: print the shape before and after reshaping 2x3 to 3x2.
A = mge.tensor([[1., 2., 3.], [4., 5., 6.]])
print(A.shape)
A = A.reshape(3, 2)
print(A.shape)

# Matrix product of a 2x3 tensor with a 3x2 tensor.
x = mge.tensor([[1., 3., 5.], [2., 4., 6.]])
w = mge.tensor([[1., 2.], [3., 4.], [5., 6.]])
p = F.matmul(x, w)
print(p)
def fwd(x, y):
    """Return ``F.matmul(x, y)``."""
    product = F.matmul(x, y)
    return product
def fwd(data1, data2):
    """Return ``F.matmul(data1, data2)``."""
    product = F.matmul(data1, data2)
    return product
def forward(self, embedding):
    """Return the logits of ``embedding`` against the normalised weights.

    ``self.weight`` is normalised along axis 1 and transposed before the
    matrix product. ``embedding`` is used as-is because it has been
    normalized already.
    """
    unit_weight = F.normalize(self.weight, axis=1)
    return F.matmul(embedding, unit_weight.transpose(1, 0))
def func(a, b):
    """Return ``F.matmul(a, b)``."""
    product = F.matmul(a, b)
    return product
def forward(self, x):
    """Multiply ``x`` by ``self.linear_weight`` and apply ``self.bn``.

    ``self.transpose`` is forwarded as ``transpose_b`` to ``F.matmul``.
    """
    projected = F.matmul(x, self.linear_weight, transpose_b=self.transpose)
    return self.bn(projected)
def forward(self, features, label=None, mask=None):
    """Compute the supervised contrastive loss.

    If label and mask are both None, the loss degenerates to the SimCLR
    unsupervised loss.

    Reference:
        "A Simple Framework for Contrastive Learning of Visual
        Representations" <https://arxiv.org/pdf/2002.05709.pdf>
        "Supervised Contrastive Learning" <https://arxiv.org/abs/2004.11362>

    Args:
        features(tensor): The embedding feature. shape=[bs, n_views, ...]
        label(tensor): The label of images, shape=[bs]
        mask(tensor): contrastive mask of shape [bsz, bsz], mask_{i,j}=1 if
            sample j has the same class as sample i. Can be asymmetric.

    Returns:
        loss

    Raises:
        ValueError: if ``features`` has fewer than 3 dimensions, if both
            ``label`` and ``mask`` are given, or if ``self.contrast_mode``
            is neither "one" nor "all".
        RuntimeError: if the number of labels differs from the batch size.
    """
    if len(features.shape) < 3:
        raise ValueError("Features need have 3 dimensions at least")
    bs, num_view = features.shape[:2]
    # if dimension > 3, change the shape of the features to [bs, num_view, ...]
    if len(features.shape) > 3:
        features = features.reshape(bs, num_view, -1)

    # label and mask cannot be provided at the same time
    if (label is not None) and (mask is not None):
        raise ValueError("label and mask cannot provided at the same time")
    elif (label is None) and (mask is None):
        mask = F.eye(bs, dtype="float32")
    elif label is not None:
        label = label.reshape(-1, 1)
        if label.shape[0] != bs:
            raise RuntimeError(
                "Num of labels does not match num of features")
        # Cast the comparison result so every branch yields a float32 mask.
        mask = F.equal(label, label.T).astype("float32")
    else:
        mask = mask.astype("float32")

    contrast_count = features.shape[1]
    # Stack all views along the batch axis: [bs * n_views, dim].
    views = F.split(features, contrast_count, axis=1)
    contrast_feature = F.squeeze(F.concat(views, axis=0), axis=1)
    if self.contrast_mode == "one":
        # Index the original tensor: ``F.split`` returns a list, which
        # cannot be indexed with ``[:, 0]``.
        anchor_feature = features[:, 0]
        anchor_count = 1
    elif self.contrast_mode == "all":
        anchor_feature = contrast_feature
        anchor_count = contrast_count
    else:
        raise ValueError("Unknown mode:{}".format(self.contrast_mode))

    # compute logits
    anchor_dot_contrast = F.div(
        F.matmul(anchor_feature, contrast_feature.T), self.temperate)
    # for numerical stability
    logits_max = F.max(anchor_dot_contrast, axis=-1, keepdims=True)
    logits = anchor_dot_contrast - logits_max

    # tile mask: repeat it ``contrast_count`` times along the columns and
    # ``anchor_count`` times along the rows so that
    # tiled[i, j] == mask[i % bs, j % bs]. (Stacking copies and reshaping,
    # as done before, does not produce this layout.)
    mask_row = F.concat([mask] * contrast_count, axis=1)
    mask = F.concat([mask_row] * anchor_count, axis=0)

    # mask-out self-contrast cases: ones everywhere except the diagonal
    logits_mask = F.scatter(
        F.ones_like(mask), 1,
        F.arange(0, int(bs * anchor_count), dtype="int32").reshape(-1, 1),
        F.zeros(int(bs * anchor_count), dtype="int32").reshape(-1, 1))
    mask = mask * logits_mask

    # compute log_prob
    exp_logits = F.exp(logits) * logits_mask
    log_prob = logits - F.log(F.sum(exp_logits, axis=1,
                                    keepdims=True))  # equation 2

    # mean of the log-likelihood over the positives of every anchor
    mean_log_prob_pos = F.sum(mask * log_prob, axis=1) / F.sum(mask, axis=1)

    # loss
    loss = -(self.temperate / self.base_temperate) * mean_log_prob_pos
    loss = F.mean(loss.reshape(anchor_count, bs))
    return loss
def f(x):
    """Return ``F.dot(u, F.matmul(x, v))``.

    ``u`` and ``v`` are taken from the enclosing scope.
    """
    xv = F.matmul(x, v)
    return F.dot(u, xv)