def create_mask(self, qlen, mlen):
    """Build the causal attention mask as a float tensor of shape
    ``[qlen, mlen + qlen]``, where 1.0 marks a masked position and 0.0 an
    attendable one.

    Args:
        qlen: Query (current segment) length.
        mlen: Memory length prepended on the key axis.

    ::

            same_length=False:      same_length=True:
            <mlen > <  qlen >       <mlen > <  qlen >
         ^ [0 0 0 0 0 1 1 1 1]     [0 0 0 0 0 1 1 1 1]
           [0 0 0 0 0 0 1 1 1]     [1 0 0 0 0 0 1 1 1]
      qlen [0 0 0 0 0 0 0 1 1]     [1 1 0 0 0 0 0 1 1]
           [0 0 0 0 0 0 0 0 1]     [1 1 1 0 0 0 0 0 1]
         v [0 0 0 0 0 0 0 0 0]     [1 1 1 1 0 0 0 0 0]
    """
    square = paddle.ones([qlen, qlen])
    # Strict upper triangle blocks future positions inside the segment;
    # the memory columns are never masked by the causal part.
    future = paddle.triu(square, diagonal=1)
    mem_cols = paddle.zeros([qlen, mlen])
    mask = paddle.concat([mem_cols, future], axis=1)
    if self.same_length:
        # Additionally mask positions too far in the past, so every query
        # attends to the same number of keys.
        past = paddle.tril(square, diagonal=-1)
        mask = paddle.concat([mask[:, :qlen] + past, mask[:, qlen:]], axis=1)
    return mask
def seq2feats(self, log_seqs, time_matrices):
    """Encode item-id sequences plus their time-interval matrices into
    per-position feature vectors (TiSASRec-style encoder).

    Args:
        log_seqs: Integer tensor of item ids; 0 is the padding placeholder.
        time_matrices: Integer tensor of bucketed pairwise time intervals.

    Returns:
        Layer-normalized hidden states, one vector per sequence position.
    """
    # Scaled item embeddings with dropout.
    feats = self.item_emb(log_seqs)
    feats = self.item_emb_dropout(feats * self.item_emb._embedding_dim**0.5)

    # Absolute-position key/value embeddings, shared across the batch.
    batch = log_seqs.shape[0]
    seq_len = log_seqs.shape[1]
    positions = paddle.arange(seq_len).unsqueeze(0).expand([batch, -1])
    abs_pos_K = self.abs_pos_K_emb_dropout(self.abs_pos_K_emb(positions))
    abs_pos_V = self.abs_pos_V_emb_dropout(self.abs_pos_V_emb(positions))

    # Relative time-interval key/value embeddings.
    time_matrix_K = self.time_matrix_K_dropout(
        self.time_matrix_K_emb(time_matrices))
    time_matrix_V = self.time_matrix_V_dropout(
        self.time_matrix_V_emb(time_matrices))

    # Item id 0 is a dry-run placeholder: remember where it sits and zero
    # its features out.
    timeline_mask = log_seqs == 0
    valid = (log_seqs != 0).astype(paddle.get_default_dtype()).unsqueeze(-1)
    feats = feats * valid

    # Boolean mask, True where attention must be blocked (enforces causality).
    attention_mask = (paddle.tril(paddle.ones([seq_len, seq_len])) == 0
                      ).astype(paddle.bool)

    for i, attn in enumerate(self.attention_layers):
        # Self-attention with Q = layernorm(feats), K = V = feats.
        q = self.attention_layernorms[i](feats)
        feats = q + attn(q, feats, timeline_mask, attention_mask,
                         time_matrix_K, time_matrix_V, abs_pos_K, abs_pos_V)
        # Point-wise feed-forward (two Conv1D layers for channel fusion).
        feats = self.forward_layers[i](self.forward_layernorms[i](feats))
        # Re-zero the padding positions after every block.
        feats = feats * (timeline_mask.astype(int) == 0
                         ).astype(paddle.get_default_dtype()).unsqueeze(-1)

    return self.last_layernorm(feats)
def _rel_shift(self, x, zero_triu=False):
    """Relative-position shift (Transformer-XL trick).

    Shifts the last axis of ``x`` so that relative-position scores line up
    with absolute positions, via a zero-pad + reshape + slice.

    Args:
        x: Rank-4 score tensor; the pad-on-last-axis / swap-last-two-dims
           reshape below implies layout ``[*, *, qlen, klen]`` (leading dims
           are batch-like, e.g. batch and heads).
        zero_triu: If True, zero out entries above the shifted diagonal.

    Returns:
        The shifted tensor with the same shape as ``x``.
    """
    x_shape = x.shape
    zero_pad = paddle.zeros(
        [x_shape[0], x_shape[1], x_shape[2], 1], dtype=x.dtype)
    x_padded = paddle.concat([zero_pad, x], axis=-1)
    # Swapping the last two dims after padding realizes the shift once the
    # first row is dropped.
    x_padded = paddle.reshape(
        x_padded,
        shape=[x_shape[0], x_shape[1], x_shape[3] + 1, x_shape[2]])
    x = paddle.reshape(x_padded[:, :, 1:, :], shape=x_shape)
    if zero_triu:
        ones = paddle.ones([x_shape[2], x_shape[3]])
        # Fix: the [qlen, klen] mask needs leading singleton axes
        # ([1, 1, qlen, klen]) to broadcast against x. The previous
        # unsqueeze([2, 3]) produced [qlen, klen, 1, 1], which cannot
        # broadcast with [batch, n_head, qlen, klen].
        x = x * paddle.tril(
            ones, diagonal=x_shape[3] - x_shape[2]).unsqueeze([0, 1])
    return x
def future_mask(time_steps, dtype="bool"):
    """Build a lower-triangular causal mask for a transformer decoder.

    Positions on and below the diagonal are 1 (visible); strictly future
    positions are 0 (hidden).

    Parameters
    ----------
    time_steps : int
        Decoder time steps.
    dtype : str, optional
        Data type of the generated mask, by default "bool".

    Returns
    -------
    Tensor
        A ``[time_steps, time_steps]`` mask of the requested dtype.
    """
    square = paddle.ones([time_steps, time_steps])
    lower_triangular = paddle.tril(square)
    return paddle.cast(lower_triangular, dtype)
def forward(self, x, kv_cache=None):
    """Multi-head causal self-attention with an optional KV cache.

    Args:
        x: Input of shape [batch, seq_len, embedding_size].
        kv_cache: Optional previously cached keys/values stacked along
            axis 1 (as produced by this method's ``cached_kv`` return);
            when given, new keys/values are appended on the time axis.

    Returns:
        Tuple of (attention output [batch, seq_len, embedding_size],
        updated ``cached_kv`` with keys and values stacked along axis 1).
    """
    self.seq_len = x.shape[1]
    x = self.query_key_value(x)
    q, k, v = x.split(num_or_sections=3, axis=2)

    q = self.split_heads(q)
    k = self.split_heads(k)
    v = self.split_heads(v)

    if kv_cache is not None:
        pk, pv = paddle.unstack(kv_cache, axis=1)
        k = paddle.concat([pk, k], axis=-2)
        v = paddle.concat([pv, v], axis=-2)
    cached_kv = paddle.stack([k, v], axis=1)

    attn = paddle.matmul(q, k, transpose_y=True)  # [B, N, L, S]
    attn = attn / math.sqrt(self.size_per_head)

    # Causal mask of shape [L, S], where S = cached length + seq_len.
    # Fix: the mask previously used [seq_len, seq_len], which only
    # broadcasts against [B, N, L, S] when there is no cache or when
    # seq_len == 1. The diagonal offset S - L lets every query attend to
    # all cached positions; with no cache (S == L) this reduces to the
    # original lower-triangular mask.
    total_len = k.shape[-2]
    attention_mask = paddle.tril(
        paddle.ones([self.seq_len, total_len], 'float32'),
        diagonal=total_len - self.seq_len)
    attention_mask = attention_mask.reshape(
        [1, 1, self.seq_len, total_len])

    # Adding a large negative value to masked logits before softmax is
    # equivalent to removing those positions entirely.
    attn = attn * attention_mask - 10000.0 * (1.0 - attention_mask)

    attn = nn.Softmax(axis=-1)(attn)
    attn = self.attn_drop(attn)
    y = paddle.matmul(attn, v)

    # [B, N, L, S] -> [B, L, N, S]
    y = y.transpose((0, 2, 1, 3))
    y = paddle.reshape(y, [-1, self.seq_len, self.embedding_size])
    y = self.resid_drop(self.dense(y))

    return y, cached_kv
def _forward(self, dec_inputs, mems=None):
    """Transformer-XL style decoder forward pass.

    Args:
        dec_inputs: Integer token-id tensor of shape [bsz, qlen].
        mems: Optional list of per-layer memory tensors from the previous
            segment (one entry per layer); None disables memory.

    Returns:
        Tuple of (core_out, new_mems): the dropped-out top-layer hidden
        states and the memories updated via ``self._update_mems``.
    """
    bsz, qlen = dec_inputs.shape
    word_emb = self.word_emb(dec_inputs)
    # Memory length; assumes mems[0] is [bsz, mlen, d_model] -- TODO confirm
    # layout against _update_mems.
    mlen = mems[0].shape[1] if mems is not None else 0
    klen = mlen + qlen
    if self.same_length:
        # Every query sees the same number of keys: mask the strict future
        # (triu beyond the memory offset) plus the too-distant past (tril).
        all_ones = paddle.ones(shape=[qlen, klen], dtype=word_emb.dtype)
        mask_len = klen - self.mem_len
        if mask_len > 0:
            mask_shift_len = qlen - mask_len
        else:
            mask_shift_len = qlen
        dec_attn_mask = (paddle.triu(
            all_ones, diagonal=1 + mlen) + paddle.tril(
                all_ones, -mask_shift_len)).unsqueeze([0, 1])
    else:
        # Plain causal mask over [qlen, klen]; memory columns stay visible.
        dec_attn_mask = paddle.ones(
            shape=[qlen, klen], dtype=word_emb.dtype)
        dec_attn_mask = paddle.triu(
            dec_attn_mask, diagonal=1 + mlen).unsqueeze([0, 1])
    # Hidden states of every layer, collected for memory update.
    hids = []
    if self.attn_type == 0:
        # Relative positional attention with learnable global biases.
        # Positions count down from klen-1 to 0 (distance to current token).
        pos_seq = paddle.arange(klen - 1, -1, -1.0, dtype=word_emb.dtype)
        if self.clamp_len > 0:
            # TODO: clamp and clip
            pos_seq = paddle.clip(pos_seq, max=self.clamp_len)
        pos_emb = self.pos_emb(pos_seq, bsz)
        core_out = self.drop(word_emb)
        pos_emb = self.drop(pos_emb)
        hids.append(core_out)
        for i, layer in enumerate(self.layers):
            mems_i = None if mems is None else mems[i]
            core_out = layer(
                core_out,
                pos_emb,
                self.r_w_bias,
                self.r_r_bias,
                dec_attn_mask=dec_attn_mask,
                mems=mems_i)
            hids.append(core_out)
    elif self.attn_type == 1:
        # Per-layer learnable relative embeddings/biases.
        core_out = self.drop(word_emb)
        hids.append(core_out)
        for i, layer in enumerate(self.layers):
            if self.clamp_len > 0:
                r_emb = self.r_emb[i][-self.clamp_len:]
                r_bias = self.r_bias[i][-self.clamp_len:]
            else:
                r_emb, r_bias = self.r_emb[i], self.r_bias[i]
            mems_i = None if mems is None else mems[i]
            core_out = layer(
                core_out,
                r_emb,
                self.r_w_bias[i],
                r_bias,
                dec_attn_mask=dec_attn_mask,
                mems=mems_i)
            hids.append(core_out)
    elif self.attn_type == 2:
        # Absolute positional embeddings added to the input; the first
        # layer's memory also gets the memory-position embeddings.
        pos_seq = paddle.arange(klen - 1, -1, -1.0, dtype=word_emb.dtype)
        if self.clamp_len > 0:
            pos_seq = paddle.clip(pos_seq, max=self.clamp_len)
        pos_emb = self.pos_emb(pos_seq, bsz)
        core_out = self.drop(word_emb + pos_emb[-qlen:])
        hids.append(core_out)
        for i, layer in enumerate(self.layers):
            mems_i = None if mems is None else mems[i]
            if mems_i is not None and i == 0:
                mems_i += pos_emb[:mlen]
            core_out = layer(
                core_out, dec_attn_mask=dec_attn_mask, mems=mems_i)
            hids.append(core_out)
    elif self.attn_type == 3:
        # Per-layer absolute embeddings added to memory and current input.
        core_out = self.drop(word_emb)
        hids.append(core_out)
        for i, layer in enumerate(self.layers):
            mems_i = None if mems is None else mems[i]
            if mems_i is not None and mlen > 0:
                cur_emb = self.r_emb[i][:-qlen]
                # NOTE(review): `.size(0)` is a torch idiom — in Paddle,
                # Tensor.size is not callable; this branch likely fails at
                # runtime. Same for `.expand(a, -1, -1)` (Paddle expects a
                # shape list) and `.view(...)` below. Confirm whether
                # attn_type == 3 is ever exercised.
                cur_size = cur_emb.size(0)
                if cur_size < mlen:
                    cur_emb_pad = cur_emb[0:1].expand(mlen - cur_size, -1,
                                                      -1)
                    cur_emb = paddle.concat([cur_emb_pad, cur_emb], 0)
                else:
                    cur_emb = cur_emb[-mlen:]
                mems_i += cur_emb.view(mlen, 1, -1)
            core_out += self.r_emb[i][-qlen:].view(qlen, 1, -1)
            core_out = layer(
                core_out, dec_attn_mask=dec_attn_mask, mems=mems_i)
            hids.append(core_out)
    core_out = self.drop(core_out)
    new_mems = self._update_mems(hids, mems, mlen, qlen)
    return core_out, new_mems
def deep_match(item_his_eb, context_his_eb, mask, match_mask,
               mid_his_batch, item_vectors, item_biases, n_mid):
    """Deep-match module: causal attention over the behavior history plus an
    auxiliary next-item softmax loss.

    NOTE(review): this function references ``self`` but does not declare it
    as a parameter, so it must be defined inside a method where ``self`` is
    in scope (a closure) — confirm at the definition site. ``n_mid`` and the
    local ``num_sampled`` are unused here (the full-softmax path is taken).

    Args (shapes per the inline comments; assumed, TODO confirm):
        item_his_eb: [B, T, E] history item embeddings.
        context_his_eb: [B, T, *] context features of the history.
        mask: [B, T] validity mask of history positions (1 = valid).
        match_mask: [B, T] mask applied to the auxiliary match target.
        mid_his_batch: [B, T] history item ids; the last column is the label.
        item_vectors: item embedding table used as output softmax weights.
        item_biases: per-item bias added to the logits.
        n_mid: number of items (unused).

    Returns:
        Tuple of (loss, user_vector, scores).
    """
    query = context_his_eb
    query = self.query_layer(
        query)  # [-1, self.history_length, self.main_embedding_size*2]
    query = self.query_prelu(query)
    # DIN-style attention input: query, key, and their interactions.
    inputs = paddle.concat(
        [
            query, item_his_eb, query - item_his_eb, query * item_his_eb
        ],
        axis=-1)  # B,T,E
    att_layer1 = self.att_layer1_layer(inputs)
    att_layer1 = F.sigmoid(att_layer1)
    att_layer2 = self.att_layer2_layer(att_layer1)
    att_layer2 = F.sigmoid(att_layer2)
    att_layer3 = self.att_layer3_layer(att_layer2)  # B,T,1
    scores = paddle.transpose(att_layer3, [0, 2, 1])  # B,1,T
    # Mask invalid positions with a large negative before softmax.
    bool_mask = paddle.equal(mask, paddle.ones_like(mask))  # B,T
    key_masks = paddle.unsqueeze(bool_mask, axis=1)  # B,1,T
    paddings = paddle.ones_like(scores) * (-2**32 + 1)
    scores = paddle.where(key_masks, scores, paddings)
    # Tile the scores to [B, T, T] and apply a lower-triangular (causal)
    # mask so position t only attends to positions <= t.
    scores_tile = paddle.tile(
        paddle.sum(scores, axis=1),
        [1, paddle.shape(scores)[-1]])  # B, T*T
    scores_tile = paddle.reshape(scores_tile, [
        -1, paddle.shape(scores)[-1], paddle.shape(scores)[-1]
    ])  # B, T, T
    diag_vals = paddle.ones_like(scores_tile)  # B, T, T
    tril = paddle.tril(diag_vals)
    paddings = paddle.ones_like(tril) * (-2**32 + 1)
    scores_tile = paddle.where(
        paddle.equal(tril, paddle.full([1], 0.0, "float32")), paddings,
        scores_tile)  # B, T, T
    scores_tile = F.softmax(scores_tile)  # B, T, T
    # Attention-weighted history representation per position.
    att_dm_item_his_eb = paddle.matmul(scores_tile,
                                       item_his_eb)  # B, T, E
    dnn_layer1 = self.dnn_layer1_layer(att_dm_item_his_eb)
    dnn_layer1 = dnn_layer1.reshape(
        [-1, self.history_length, self.main_embedding_size])  ##
    dnn_layer1 = self.dnn_layer1_prelu(dnn_layer1)
    # Target mask: the last position is the user vector; the second-to-last
    # (masked by match_mask) is used for the auxiliary match loss.
    user_vector = dnn_layer1[:, -1, :]  # B, E
    user_vector2 = dnn_layer1[:, -2, :] * paddle.reshape(
        match_mask, [-1, paddle.shape(match_mask)[1], 1])[:, -2, :]  # B, E
    num_sampled = 2000
    labels = paddle.reshape(mid_his_batch[:, -1], [-1, 1])  # B, 1
    # Not sampled softmax — full softmax over all items, slow.
    # [B, E] * [E_size, cate_size]
    logits = paddle.matmul(
        user_vector2, item_vectors, transpose_y=True)
    logits = paddle.add(logits, item_biases)
    loss = F.cross_entropy(input=logits, label=labels)
    return loss, user_vector, scores
# [N, G] 是否是gt。 is_gt = np.array([[1, 1, 0], [1, 1, 1]]).astype(np.float32) is_in_boxes_or_center = np.array([[3, 100, 103, 2, 109], [3, 100, 103, 2, 109]]).astype(np.float32) cost = paddle.to_tensor(cost) dynamic_ks = paddle.to_tensor(dynamic_ks) is_gt = paddle.to_tensor(is_gt) max_dynamic_ks = dynamic_ks.max(-1) # [N, ] 每张图片所有gt的dynamic_ks的最大值 max_k = max_dynamic_ks.max() # [1, ] 所有图片所有gt的dynamic_ks的最大值 # 下三角全是1的矩阵 topk_mask = paddle.ones((max_k, max_k), 'float32') # [max_k, max_k] topk_mask = paddle.tril(topk_mask, diagonal=0) # [max_k, max_k] fill_value = paddle.gather(topk_mask, dynamic_ks.reshape( (-1, )) - 1) # [N*G, max_k] 填入matching_matrix fill_value *= is_gt.reshape((-1, 1)) # [N*G, max_k] 还要处理假gt,假gt处全部填0 fill_value = fill_value.reshape((-1, )) # [N*G*max_k, ] 填入matching_matrix # 不放心的话,再次将假gt的cost增大 cost += (1.0 - is_gt.unsqueeze(2)) * 100000.0 min_cost, min_cost_index = paddle.topk(cost, k=max_k, axis=2, largest=False, sorted=True) matching_matrix = paddle.zeros([ N * G * A,
def dynamic_k_matching(self, cost, pair_wise_ious, gt_classes, N, G, A,
                       is_in_boxes_or_center, is_gt):
    """SimOTA dynamic-k label assignment (YOLOX-style).

    Args (shapes per the original comments):
        cost: [N, G, A] pairwise cost between every gt and every prediction.
        pair_wise_ious: [N, G, A] pairwise IoU between every gt and prediction.
        gt_classes: [N*G, ] class id of every gt.
        N, G, A: batch size, max gts per image, number of anchors/cells.
        is_in_boxes_or_center: [N, A] 1 where a cell lies inside any gt box
            or any gt's mirrored center region (candidate positives).
        is_gt: [N, G] 1 at real (non-padded) gts.

    Returns:
        Tuple of (num_fg, gt_matched_classes, pred_ious_this_matching,
        matched_gt_inds, fg_mask) — see the trailing comments.
    """
    # Dynamic K
    # ---------------------------------------------------------------
    # 4-2-5-1. Decide how many predictions (cells) each gt should receive.
    # Consider at most the 10 predictions with the highest IoU per gt.
    n_candidate_k = 10
    # [N, G, n_candidate_k] top-IoU predictions for each gt.
    topk_ious, _ = paddle.topk(pair_wise_ious, n_candidate_k, axis=-1)
    # [N, G] sum of those IoUs gives each gt's dynamic k.
    dynamic_ks = topk_ious.sum(-1)
    dynamic_ks = paddle.clip(
        dynamic_ks, 1.0,
        np.inf)  # [N, G] clamp dynamic_ks to [1.0, inf)
    dynamic_ks = paddle.cast(
        dynamic_ks,
        'int32')  # [N, G] truncate; each gt gets at least 1 prediction.
    max_dynamic_ks = dynamic_ks.max(-1)  # [N, ] per-image max of dynamic_ks
    max_k = max_dynamic_ks.max()  # [1, ] global max of dynamic_ks
    # 4-2-5-2. Build matching_matrix of shape [N, G, A]: write 1 at each
    # gt's dynamic_ks lowest-cost predictions (gt assigned to those cells).
    # For safety, inflate the cost of fake gts — they must never determine
    # final positives.
    cost += (1.0 - is_gt.unsqueeze(2)) * 100000.0
    # Likewise inflate the cost of non-candidate cells — they are not
    # eligible to become final positives.
    cost += (1.0 - is_in_boxes_or_center.unsqueeze(1)) * 100000.0
    # min_cost:       [N, G, max_k] the max_k smallest costs per gt.
    # min_cost_index: [N, G, max_k] which predictions (cells) those are.
    min_cost, min_cost_index = paddle.topk(
        cost, k=max_k, axis=2, largest=False, sorted=True)
    matching_matrix = paddle.zeros([
        N * G * A,
    ], 'float32')  # [N*G*A, ]
    gt_ind = paddle.arange(end=N * G, dtype='int32').unsqueeze(
        -1)  # [N*G, 1] offset of each gt inside matching_matrix.
    min_cost_index = min_cost_index.reshape((N * G, max_k))  # [N*G, max_k]
    min_cost_index = gt_ind * A + min_cost_index  # [N*G, max_k]
    min_cost_index = min_cost_index.flatten()  # [N*G*max_k, ]
    # Lower-triangular ones matrix: gathering row dynamic_ks-1 yields each
    # gt's fill pattern (first dynamic_ks entries 1, the rest 0).
    topk_mask = paddle.ones((max_k, max_k), 'float32')  # [max_k, max_k]
    topk_mask = paddle.tril(topk_mask, diagonal=0)  # [max_k, max_k]
    fill_value = paddle.gather(topk_mask,
                               dynamic_ks.reshape(
                                   (-1, )) - 1)  # [N*G, max_k]
    fill_value *= is_gt.reshape((-1, 1))  # [N*G, max_k] zero out fake gts
    fill_value = fill_value.reshape(
        (-1, ))  # [N*G*max_k, ] values for matching_matrix
    # Scatter the assignments into matching_matrix.
    matching_matrix = paddle.scatter(matching_matrix, min_cost_index,
                                     fill_value, overwrite=True)
    matching_matrix = matching_matrix.reshape((N, G, A))  # [N, G, A]
    # 4-2-5-3. If one prediction matched more than one gt, resolve it:
    # a cell can only learn a single gt, so keep the gt with minimum cost.
    # [N, A] how many gts each prediction matched.
    anchor_matching_gt = matching_matrix.sum(1)
    # Special-case handling only when some prediction matched > 1 gt.
    if paddle.cast(anchor_matching_gt > 1, 'float32').sum() > 0:
        # Find the R over-matched cells; [R, 2] coordinates in [N, A]:
        # column 0 is the image index, column 1 the cell index.
        index = paddle.nonzero(anchor_matching_gt > 1)
        cost_t = cost.transpose(
            (0, 2, 1))  # [N, G, A] -> [N, A, G] to ease extraction
        cost2 = paddle.gather_nd(
            cost_t, index)  # [R, G] costs between those R cells and all gts.
        cost2 = cost2.transpose((1, 0))  # [G, R]
        cost_argmin = cost2.argmin(
            axis=0)  # [R, ] index of the cheapest gt for each cell
        # Prepare one-hot rows for the winning gts.
        one_hots = F.one_hot(cost_argmin, num_classes=G)  # [R, G]
        # Overwrite the over-matched rows with the one-hot assignment.
        matching_matrix = matching_matrix.transpose(
            (0, 2, 1))  # [N, G, A] -> [N, A, G] for scatter()
        matching_matrix = matching_matrix.reshape(
            (N * A, G))  # [N*A, G] flatten for scatter()
        index = index[:, 0] * A + index[:, 1]
        matching_matrix = paddle.scatter(
            matching_matrix, index, one_hots,
            overwrite=True)  # [N*A, G]
        # Restore matching_matrix to its original layout.
        matching_matrix = matching_matrix.reshape((N, A, G))  # [N, A, G]
        matching_matrix = matching_matrix.transpose(
            (0, 2, 1))  # [N, A, G] -> [N, G, A]
    # 4-2-5-4. Wrap up: prepare the supervision targets for the losses.
    # First, the objectness target.
    # [N, A] whether a cell is foreground (final positive).
    fg_mask = matching_matrix.sum(1) > 0.0  # [N, A]
    fg_mask = paddle.cast(
        fg_mask,
        'float32')  # [N, A] supervises objectness; 1 at final positives.
    num_fg = fg_mask.sum()  # total number of foreground cells
    # Second, the classification target: which class each positive learns.
    # Coordinates of final positives inside fg_mask [N, A].
    pos_index = paddle.nonzero(fg_mask > 0)  # [num_fg, 2]
    image_id = pos_index[:, 0]  # [num_fg, ] image index of each positive.
    matching_matrix_t = matching_matrix.transpose(
        (0, 2, 1))  # [N, G, A] -> [N, A, G] for gather_nd()
    matched_gt_inds = paddle.gather_nd(matching_matrix_t,
                                       pos_index)  # [num_fg, G]
    matched_gt_inds = matched_gt_inds.argmax(
        1)  # [num_fg, ] per-image gt index matched by each positive
    matched_gt_inds += image_id * G  # [num_fg, ] index into gt_classes [N*G, ]
    # Class id each final positive must learn.
    gt_matched_classes = paddle.gather(gt_classes,
                                       matched_gt_inds)  # [num_fg, ]
    # Third, the IoU between each positive and its assigned gt. Dim 1 of
    # matching_matrix has at most one non-zero entry, so the sum picks out
    # exactly the matched pair's IoU.
    ious = (matching_matrix * pair_wise_ious)  # [N, G, A]
    ious = ious.sum(1)  # [N, A] IoU of each positive with its gt.
    pred_ious_this_matching = paddle.gather_nd(ious, pos_index)  # [num_fg, ]
    # Returns:
    # num_fg                  [1, ]      number of final positives
    # gt_matched_classes      [num_fg, ] class id each positive learns
    # pred_ious_this_matching [num_fg, ] IoU of each positive with its gt
    # matched_gt_inds         [num_fg, ] matched gt index in [N*G, ]
    # fg_mask                 [N, A]     1 at final positives
    return num_fg, gt_matched_classes, pred_ious_this_matching, matched_gt_inds, fg_mask