def parallel_matmul(self, lm_output, logit_weights, parallel_output, topo):
    if topo is not None and topo.mp_info.size > 1:
        input_parallel = paddle.distributed.collective._c_identity(
            lm_output, group=None)
        logits = paddle.matmul(
            input_parallel, logit_weights, transpose_y=True)
        if parallel_output:
            return logits
        return paddle.distributed.collective._c_concat(logits, group=None)
    else:
        logits = paddle.matmul(lm_output, logit_weights, transpose_y=True)
        return logits
def __scaled_dot_product_attention(self, q, k, v, r, t, attn_mask):
    q_w, q_r, q_t = q
    score_w = paddle.matmul(q_w, k, transpose_y=True)
    score_r = paddle.matmul(q_r, r, transpose_y=True)
    score_r = self.__rel_shift(score_r, k.shape[2])
    score_t = paddle.matmul(q_t, t, transpose_y=True)
    score = score_w + score_r + score_t
    score = score * (self.d_key**-0.5)
    if attn_mask is not None:
        score += attn_mask
    weights = F.softmax(score)
    if self.dropout:
        weights = self.dropout(weights)
    out = paddle.matmul(weights, v)
    return out
def forward(self, graph, feat):
    """Forward

    Args:
        graph: heterogeneous graph built by pgl.HeterGraph.
        feat: node features/representation from graph/previous layer.
    """
    if self.num_bases < self.num_rels:
        weight = paddle.transpose(self.weight, perm=[1, 0, 2])
        weight = paddle.matmul(self.w_comp, weight)
        weight = paddle.transpose(weight, perm=[1, 0, 2])
    else:
        weight = self.weight

    def send_func(src_feat, dst_feat, edge_feat):
        """send function"""
        return src_feat

    def recv_func(msg):
        """receive function"""
        return msg.reduce_mean(msg['h'])

    feat_list = []
    for idx, etype in enumerate(self.etypes):
        sub_g = graph[graph.edge_types[idx]]
        sub_g.tensor()
        if self.norm:
            norm = GF.degree_norm(sub_g)
            feat = feat * norm
        w = weight[idx, :, :].squeeze()
        h = paddle.matmul(feat, w)
        msg = sub_g.send(send_func, src_feat={'h': h})
        h = sub_g.recv(recv_func, msg)
        feat_list.append(h)

    h = paddle.stack(feat_list, axis=0)
    h = paddle.sum(h, axis=0)
    if self.act == 'relu':
        Act = paddle.nn.ReLU()
        h = Act(h)
    else:
        Act = paddle.nn.Sigmoid()
        h = Act(h)
    return h
def _get_rand_mask(self, blocked_query_mask, blocked_key_mask, rand_mask_idx,
                   batch_size, sequence_length):
    '''
    return random mask: [B, H, L-G, bs, R * bs]
    '''
    # rand_mask_idx: [H, T]
    # blocked_query_mask: [B, L, bs]
    # blocked_key_mask: [B, L, bs]
    bs = self.block_size
    B = batch_size
    L = sequence_length // bs
    H = self.num_heads
    G = self.num_global_blocks
    GB = self.num_global_blocks_back
    GF = self.num_global_blocks_front
    R = self.num_rand_blocks
    temp_block_key_mask = paddle.unsqueeze(blocked_key_mask, 1)
    temp_block_key_mask = paddle.expand(temp_block_key_mask, [B, H, L, -1])
    temp_block_key_mask_list = [
        paddle.gather_nd(temp_block_key_mask[b], rand_mask_idx)
        for b in range(B)
    ]
    temp_block_key_mask = paddle.concat(temp_block_key_mask_list, 0)
    temp_block_key_mask = paddle.reshape(temp_block_key_mask,
                                         [B, H, L - G, 1, R * bs])
    temp_blocked_query_mask = paddle.unsqueeze(
        blocked_query_mask[:, GF:-GB], 1)
    temp_blocked_query_mask = paddle.expand(temp_blocked_query_mask,
                                            [B, H, L - G, -1])
    temp_blocked_query_mask = paddle.reshape(temp_blocked_query_mask,
                                             [B, H, L - G, bs, 1])
    rand_mask = paddle.matmul(temp_blocked_query_mask, temp_block_key_mask)
    return rand_mask
def forward(self, inputs):
    token_ids = inputs['token_ids']
    type_ids = inputs['type_ids']
    pos_ids = inputs['pos_ids']
    generation_mask = inputs['generation_mask']
    latent_id = inputs['latent_id']
    data_id = inputs['data_id']

    # [-1, 1, latent_type_size]
    latent_id = F.one_hot(latent_id, self.latent_type_size)
    # [-1, 1, hidden_size]
    latent_emb = paddle.matmul(
        latent_id, self.latent_weight, transpose_y=True)

    caches = self.plato2_encoder.gen_caches(token_ids)

    # [-1, seq_len + 1, hidden_size]
    enc_out, new_caches = self.plato2_encoder(
        caches, token_ids, type_ids, pos_ids, generation_mask, latent_emb)

    pred_ids = self.decode(inputs, new_caches)

    nsp_inputs = self.gen_nsp_input(token_ids, pred_ids)
    # [-1, 2]
    probs = self.nsp_predictor(nsp_inputs)

    return self.get_results(data_id, token_ids, pred_ids, probs)
def forward(self, inputs, encoder_word_pos, gsrm_word_pos):
    b, c, h, w = inputs.shape
    conv_features = paddle.reshape(inputs, shape=[-1, c, h * w])
    conv_features = paddle.transpose(conv_features, perm=[0, 2, 1])

    # transformer encoder
    b, t, c = conv_features.shape
    enc_inputs = [conv_features, encoder_word_pos, None]
    word_features = self.wrap_encoder_for_feature(enc_inputs)

    # pvam
    b, t, c = word_features.shape
    word_features = self.fc0(word_features)
    word_features_ = paddle.reshape(word_features, [-1, 1, t, c])
    word_features_ = paddle.tile(word_features_, [1, self.max_length, 1, 1])
    word_pos_feature = self.emb(gsrm_word_pos)
    word_pos_feature_ = paddle.reshape(word_pos_feature,
                                       [-1, self.max_length, 1, c])
    word_pos_feature_ = paddle.tile(word_pos_feature_, [1, 1, t, 1])
    y = word_pos_feature_ + word_features_
    y = F.tanh(y)
    attention_weight = self.fc1(y)
    attention_weight = paddle.reshape(
        attention_weight, shape=[-1, self.max_length, t])
    attention_weight = F.softmax(attention_weight, axis=-1)
    pvam_features = paddle.matmul(attention_weight,
                                  word_features)  # [b, max_length, c]
    return pvam_features
def model(self,
          input_ids,
          position_ids=None,
          attention_mask=None,
          masked_positions=None,
          use_cache=False,
          cache=None):
    outputs = self.gpt(input_ids,
                       position_ids=position_ids,
                       attention_mask=attention_mask,
                       use_cache=use_cache,
                       cache=cache)
    if use_cache:
        encoder_outputs, cached_kvs = outputs[:2]
    else:
        encoder_outputs = outputs
    logits = paddle.matmul(
        encoder_outputs,
        self.gpt.embeddings.word_embeddings.weight,
        transpose_y=True)
    if use_cache:
        return logits, cached_kvs
    else:
        return logits
def generate_relative_positions_embeddings(self,
                                           length,
                                           depth,
                                           max_relative_position=127):
    vocab_size = max_relative_position * 2 + 1
    range_vec = paddle.arange(length)
    range_mat = paddle.tile(range_vec, repeat_times=[length]).reshape(
        (length, length))
    distance_mat = range_mat - paddle.t(range_mat)
    distance_mat_clipped = paddle.clip(distance_mat.astype('float32'),
                                       -max_relative_position,
                                       max_relative_position)
    final_mat = distance_mat_clipped + max_relative_position
    embeddings_table = np.zeros([vocab_size, depth])
    for pos in range(vocab_size):
        for i in range(depth // 2):
            embeddings_table[pos, 2 * i] = np.sin(
                pos / np.power(10000, 2 * i / depth))
            embeddings_table[pos, 2 * i + 1] = np.cos(
                pos / np.power(10000, 2 * i / depth))

    embeddings_table_tensor = paddle.to_tensor(embeddings_table,
                                               dtype='float32')
    flat_relative_positions_matrix = final_mat.reshape((-1, ))
    one_hot_relative_positions_matrix = paddle.nn.functional.one_hot(
        flat_relative_positions_matrix.astype('int64'),
        num_classes=vocab_size)
    embeddings = paddle.matmul(one_hot_relative_positions_matrix,
                               embeddings_table_tensor)
    my_shape = final_mat.shape
    my_shape.append(depth)
    embeddings = embeddings.reshape(my_shape)
    return embeddings
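# Illustrative note (not part of the original code): for a call like
# generate_relative_positions_embeddings(length=L, depth=D), the returned
# tensor has shape [L, L, D]; entry (i, j) is the sinusoidal embedding of the
# relative distance (j - i), clipped to [-max_relative_position,
# max_relative_position] and shifted into the non-negative table index range.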
def forward(self, x):
    b, c, h, w = x.shape
    x = paddle.reshape(x, [b, c, h * w])
    mu = paddle.tile(self.mu, [b, 1, 1])

    with paddle.no_grad():
        for i in range(self.stage_num):
            x_t = paddle.transpose(x, [0, 2, 1])
            z = paddle.bmm(x_t, mu)
            z = F.softmax(z, axis=2)
            z_ = F.normalize(z, axis=1, p=1)
            mu = paddle.bmm(x, z_)
            mu = F.normalize(mu, axis=1, p=2)

    z_t = paddle.transpose(z, [0, 2, 1])
    x = paddle.matmul(mu, z_t)
    x = paddle.reshape(x, [b, c, h, w])

    if self.training:
        mu = paddle.mean(mu, 0, keepdim=True)
        if paddle.distributed.get_world_size() > 1:
            paddle.distributed.reduce(
                mu / paddle.distributed.get_world_size(), 0)
        mu = F.normalize(mu, axis=1, p=2)
        self.mu = self.mu * (1 - self.momentum) + mu * self.momentum
    return x
def get_active_filter(self, in_nc, out_nc, kernel_size):
    start, end = compute_start_end(self._kernel_size[0], kernel_size)
    filters = self.weight[:in_nc, :out_nc, start:end, start:end]
    if self.transform_kernel != False and kernel_size < self._kernel_size[0]:
        start_filter = self.weight[:in_nc, :out_nc, :, :]
        for i in range(len(self.ks_set) - 1, 0, -1):
            src_ks = self.ks_set[i]
            if src_ks <= kernel_size:
                break
            target_ks = self.ks_set[i - 1]
            start, end = compute_start_end(src_ks, target_ks)
            _input_filter = start_filter[:, :, start:end, start:end]
            _input_filter = paddle.reshape(
                _input_filter,
                shape=[(_input_filter.shape[0] * _input_filter.shape[1]), -1])
            _input_filter = paddle.matmul(
                _input_filter,
                self.__getattr__('%dto%d_matrix' % (src_ks, target_ks)),
                False, False)
            _input_filter = paddle.reshape(
                _input_filter,
                shape=[
                    filters.shape[0], filters.shape[1], target_ks, target_ks
                ])
            start_filter = _input_filter
        filters = start_filter
    return filters
def forward(self, input, label, init_hidden):
    init_h = paddle.reshape(
        init_hidden, shape=[self.num_layers, -1, self.hidden_size])
    x_emb = self.embedding(input)
    x_emb = paddle.reshape(
        x_emb, shape=[-1, self.num_steps, self.hidden_size])
    if self.dropout is not None and self.dropout > 0.0:
        x_emb = paddle.nn.functional.dropout(
            x_emb, p=self.dropout, mode='upscale_in_train')
    rnn_out, last_hidden = self.simple_gru_rnn(x_emb, init_h)
    projection = paddle.matmul(x=rnn_out, y=self.softmax_weight)
    projection = paddle.add(x=projection, y=self.softmax_bias)
    loss = paddle.nn.functional.softmax_with_cross_entropy(
        logits=projection, label=label, soft_label=False)
    pre_2d = paddle.reshape(projection, shape=[-1, self.vocab_size])
    label_2d = paddle.reshape(label, shape=[-1, 1])
    acc = paddle.metric.accuracy(input=pre_2d, label=label_2d, k=20)
    loss = paddle.reshape(loss, shape=[-1, self.num_steps])
    loss = paddle.mean(loss, axis=[0])
    loss = paddle.sum(loss)
    return loss, last_hidden, acc
def _layer_dot(inputs, node):
    """
    dot product, e.g: [2, 1, 128] * (expand([1, 128, 1]) -> [2, 128, 1])
    """
    input_re = paddle.unsqueeze(inputs, axis=[2])
    dot_res = paddle.matmul(node, input_re)
    return dot_res
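# Hypothetical usage sketch (shapes inferred from the docstring above; the
# variable names are illustrative only):
#   user_vec = paddle.randn([2, 128])        # [batch, emb]
#   node_emb = paddle.randn([2, 1, 128])     # [batch, 1, emb] tree-node embedding
#   score = _layer_dot(user_vec, node_emb)   # -> [2, 1, 1], one score per sample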
def hierarchical_self_supervision(self, em, adj):
    def row_shuffle(embedding):
        return embedding[paddle.randperm(paddle.shape(embedding)[0])]

    def row_column_shuffle(embedding):
        embedding = paddle.transpose(embedding, perm=[1, 0])
        corrupted_embedding = paddle.transpose(
            embedding[paddle.randperm(paddle.shape(embedding)[0])],
            perm=[1, 0])
        return corrupted_embedding[paddle.randperm(
            paddle.shape(corrupted_embedding)[0])]

    def score(x1, x2):
        return paddle.sum(paddle.multiply(x1, x2), axis=1)

    user_embeddings = em
    edge_embeddings = paddle.matmul(adj, user_embeddings)
    # Local MIM
    pos = score(user_embeddings, edge_embeddings)
    neg1 = score(row_shuffle(user_embeddings), edge_embeddings)
    neg2 = score(row_column_shuffle(edge_embeddings), user_embeddings)
    local_loss = paddle.sum(-paddle.log(F.sigmoid(pos - neg1)) -
                            paddle.log(F.sigmoid(neg1 - neg2)))
    # Global MIM
    graph = paddle.mean(edge_embeddings, axis=0)
    pos = score(edge_embeddings, graph)
    neg1 = score(row_column_shuffle(edge_embeddings), graph)
    global_loss = paddle.sum(-paddle.log(F.sigmoid(pos - neg1)))
    return global_loss + local_loss
def channel_attention(self, *channel_embeddings):
    """
    channel_embeddings_1: (num_user, emb_size)
    attention_mat: (emb_size, emb_size)
    attention: (1, emb_size)
    """
    weights = []
    for embedding in channel_embeddings:
        # (num_user, emb_size) @ (emb_size, emb_size) -> (num_user, emb_size),
        # elementwise * (1, emb_size), then sum over axis 1 -> (num_user, )
        weights.append(
            paddle.sum(
                paddle.multiply(
                    paddle.matmul(embedding, self.weights["attention_mat"]),
                    self.weights["attention"]), 1))
    t = paddle.stack(weights)
    # (num_user, channel_num)
    score = F.softmax(paddle.transpose(t, perm=[1, 0]))
    mixed_embeddings = 0.0
    for i in range(len(weights)):
        # scale channel i, (emb_size, num_user), by its per-user score row,
        # then transpose back to (num_user, emb_size)
        mixed_embeddings += paddle.transpose(
            paddle.multiply(
                paddle.transpose(channel_embeddings[i], perm=[1, 0]),
                paddle.transpose(score, perm=[1, 0])[i]),
            perm=[1, 0])
    return mixed_embeddings, score
def forward(self, input, target=None):
    """
    anchor and positive (should include label)
    """
    features = input["features"]
    reg_lambda = self.reg_lambda
    batch_size = features.shape[0]
    fea_dim = features.shape[1]
    num_class = batch_size // 2

    # reshape
    out_feas = paddle.reshape(features, shape=[-1, 2, fea_dim])
    anc_feas, pos_feas = paddle.split(out_feas, num_or_sections=2, axis=1)
    anc_feas = paddle.squeeze(anc_feas, axis=1)
    pos_feas = paddle.squeeze(pos_feas, axis=1)

    # get similarity matrix
    similarity_matrix = paddle.matmul(
        anc_feas, pos_feas, transpose_y=True)
    sparse_labels = paddle.arange(0, num_class, dtype='int64')
    xentloss = paddle.nn.CrossEntropyLoss()(
        similarity_matrix, sparse_labels)  # by default: mean

    # l2 norm
    reg = paddle.mean(paddle.sum(paddle.square(features), axis=1))
    l2loss = 0.5 * reg_lambda * reg
    return {"npairsloss": xentloss + l2loss}
def train_iter(self, *inputs, **kwargs):
    img_q, img_k = inputs

    # compute query features
    q = self.encoder_q(img_q)  # queries: NxC
    q = nn.functional.normalize(q, axis=1)

    # compute key features
    with paddle.no_grad():  # no gradient to keys
        self._momentum_update_key_encoder()  # update the key encoder

        # shuffle for making use of BN
        im_k, idx_unshuffle = self._batch_shuffle_ddp(img_k)

        k = self.encoder_k(im_k)  # keys: NxC
        k = nn.functional.normalize(k, axis=1)

        # undo shuffle
        k = self._batch_unshuffle_ddp(k, idx_unshuffle)

    # compute logits
    # FIXME: Einstein sum is more intuitive
    # positive logits: Nx1
    l_pos = paddle.sum(q * k, axis=1).unsqueeze(-1)
    # negative logits: NxK
    l_neg = paddle.matmul(q, self.queue.clone().detach())

    outputs = self.head(l_pos, l_neg)
    self._dequeue_and_enqueue(k)
    return outputs
def forward(self, x):
    n, c, h, w = x.shape

    g_x = paddle.reshape(self.g(x), [n, self.inter_channels, -1])
    g_x = paddle.transpose(g_x, [0, 2, 1])

    if self.mode == 'gaussian':
        theta_x = paddle.reshape(x, [n, self.inter_channels, -1])
        theta_x = paddle.transpose(theta_x, [0, 2, 1])
        if self.sub_sample:
            phi_x = paddle.reshape(self.phi(x), [n, self.inter_channels, -1])
        else:
            phi_x = paddle.reshape(x, [n, self.in_channels, -1])
    elif self.mode == 'concatenation':
        theta_x = paddle.reshape(
            self.theta(x), [n, self.inter_channels, -1, 1])
        phi_x = paddle.reshape(self.phi(x), [n, self.inter_channels, 1, -1])
    else:
        theta_x = paddle.reshape(self.theta(x), [n, self.inter_channels, -1])
        theta_x = paddle.transpose(theta_x, [0, 2, 1])
        phi_x = paddle.reshape(self.phi(x), [n, self.inter_channels, -1])

    pairwise_func = getattr(self, self.mode)
    pairwise_weight = pairwise_func(theta_x, phi_x)

    y = paddle.matmul(pairwise_weight, g_x)
    y = paddle.transpose(y, [0, 2, 1])
    y = paddle.reshape(y, [n, self.inter_channels, h, w])

    output = x + self.conv_out(y)
    return output
def forward(self, inputs):
    input_emb = self.embedding(inputs[0])
    true_emb_w = self.embedding_w(inputs[1])
    true_emb_b = self.embedding_b(inputs[1])

    input_emb = paddle.squeeze(x=input_emb, axis=[1])
    true_emb_w = paddle.squeeze(x=true_emb_w, axis=[1])
    true_emb_b = paddle.squeeze(x=true_emb_b, axis=[1])

    neg_emb_w = self.embedding_w(inputs[2])
    neg_emb_b = self.embedding_b(inputs[2])
    neg_emb_b_vec = paddle.reshape(neg_emb_b, shape=[-1, self.neg_num])

    true_logits = paddle.add(
        x=paddle.sum(
            x=paddle.multiply(x=input_emb, y=true_emb_w),
            axis=1,
            keepdim=True),
        y=true_emb_b)

    input_emb_re = paddle.reshape(input_emb, shape=[-1, 1, self.emb_dim])
    neg_matmul = paddle.matmul(input_emb_re, neg_emb_w, transpose_y=True)
    neg_matmul_re = paddle.reshape(neg_matmul, shape=[-1, self.neg_num])
    neg_logits = paddle.add(x=neg_matmul_re, y=neg_emb_b_vec)
    return true_logits, neg_logits
def define_layer(self, input):
    x = fluid.data(name="x", shape=self.x_shape)
    y = fluid.data(name="y", shape=self.y_shape)
    self.input = x
    self.y = y
    out = paddle.matmul(x, y)
    self.output = out
def model(self, x, w, bias, opt):
    paddle.seed(0)
    place = paddle.CPUPlace()
    if paddle.device.is_compiled_with_cuda():
        place = paddle.CUDAPlace(0)
    exe = paddle.static.Executor(place)
    main = paddle.static.Program()
    startup = paddle.static.Program()
    with paddle.static.program_guard(main, startup):
        input_x = paddle.static.data('x', x.shape, dtype=x.dtype)
        input_x.stop_gradient = False
        params_w = paddle.static.create_parameter(
            shape=w.shape, dtype=w.dtype, is_bias=False)
        params_bias = paddle.static.create_parameter(
            shape=bias.shape, dtype=bias.dtype, is_bias=True)
        y = paddle.tanh(paddle.matmul(input_x, params_w) + params_bias)
        loss = paddle.norm(y, p=2)
        opt = opt
        _, grads = opt.minimize(loss)
        if prim_enabled():
            prim2orig(main.block(0))
    exe.run(startup)
    grads = exe.run(
        main, feed={'x': x, 'w': w, 'bias': bias}, fetch_list=grads)
    return grads
def build_program():
    main_program = paddle.static.Program()
    startup_program = paddle.static.Program()
    with paddle.static.program_guard(main_program, startup_program):
        with paddle.static.device_guard('cpu'):
            data = paddle.ones([4, 64], dtype='float32', name='data')

        # data -> [memcpy_h2d] -> data' -> [matmul] -> out -> [add] -> add_out
        with paddle.static.device_guard('gpu'):
            weight = paddle.randn([64, 64], name='weight')  # gpu
            matmul_out = paddle.matmul(data, weight, name='matmul_out')  # gpu
            bias = paddle.ones([4, 64], dtype='float32', name='bias')
            add_out = paddle.add(matmul_out, bias, name='add_out')

        # add_out -> [memcpy_d2h] -> add_out' -> [sub] -> sub_out -> [tanh] -> tanh_out
        with paddle.static.device_guard('cpu'):
            sub_out = paddle.subtract(add_out, data, name='sub_out')
            tanh_out = paddle.tanh(sub_out, name='tanh_out')

        with paddle.static.device_guard('gpu'):
            bias_1 = paddle.add(bias, sub_out, name='bias_1')
            out_before = paddle.tanh(bias_1, name='out_before')
            out_last = paddle.subtract(tanh_out, data, name='out_last')
            out = paddle.add(out_before, out_last, name='out')
            mean = paddle.mean(out, name='mean_out')

    return main_program, startup_program, [mean]
def encode_box3d(self, rotys, dims, locs):
    """
    construct 3d bounding box for each object.
    Args:
        rotys: rotation in shape N
        dims: dimensions of objects
        locs: locations of objects
    Returns:
        box_3d: corner coordinates of the 3D boxes, in shape (N, 3, 8)
    """
    if len(rotys.shape) == 2:
        rotys = rotys.flatten()
    if len(dims.shape) == 3:
        dims = paddle.reshape(dims, (-1, 3))
    if len(locs.shape) == 3:
        locs = paddle.reshape(locs, (-1, 3))

    N = rotys.shape[0]
    ry = self.rad_to_matrix(rotys, N)

    # if test:
    #     dims.register_hook(lambda grad: print('dims grad', grad.sum()))

    # dims = paddle.reshape(dims, (-1, 1)).tile([1, 8])
    # dims[::3, :4] = 0.5 * dims[::3, :4]
    # dims[1::3, :4] = 0.
    # dims[2::3, :4] = 0.5 * dims[2::3, :4]
    # dims[::3, 4:] = -0.5 * dims[::3, 4:]
    # dims[1::3, 4:] = -dims[1::3, 4:]
    # dims[2::3, 4:] = -0.5 * dims[2::3, 4:]
    dim_left_1 = (0.5 * dims[:, 0]).unsqueeze(-1)
    dim_left_2 = paddle.zeros([dims.shape[0], 1]).astype(
        "float32")  # (paddle.zeros_like(dims[:, 1])).unsqueeze(-1)
    dim_left_3 = (0.5 * dims[:, 2]).unsqueeze(-1)
    dim_left = paddle.concat([dim_left_1, dim_left_2, dim_left_3], axis=1)
    dim_left = paddle.reshape(dim_left, (-1, 1)).tile([1, 4])

    dim_right_1 = (-0.5 * dims[:, 0]).unsqueeze(-1)
    dim_right_2 = (-dims[:, 1]).unsqueeze(-1)
    dim_right_3 = (-0.5 * dims[:, 2]).unsqueeze(-1)
    dim_right = paddle.concat([dim_right_1, dim_right_2, dim_right_3], axis=1)
    dim_right = paddle.reshape(dim_right, (-1, 1)).tile([1, 4])

    dims = paddle.concat([dim_left, dim_right], axis=1)

    index = paddle.to_tensor(
        [[4, 0, 1, 2, 3, 5, 6, 7], [4, 5, 0, 1, 6, 7, 2, 3],
         [4, 5, 6, 0, 1, 2, 3, 7]]).tile([N, 1])
    box_3d_object = gather_op(dims, 1, index)
    box_3d = paddle.matmul(ry, paddle.reshape(box_3d_object, (N, 3, -1)))
    # box_3d += locs.unsqueeze(-1).repeat(1, 1, 8)
    box_3d += locs.unsqueeze(-1).tile((1, 1, 8))

    return box_3d
def forward(self, embedding, targets):
    if isinstance(embedding, dict):
        embedding = embedding['features']
    # Normalize embedding features
    embedding = F.normalize(embedding, axis=1)

    dist_mat = paddle.matmul(embedding, embedding, transpose_y=True)

    N = dist_mat.shape[0]
    is_pos = targets.reshape([N, 1]).expand([N, N]).equal(
        paddle.t(targets.reshape([N, 1]).expand([N, N]))).astype('float')
    is_neg = targets.reshape([N, 1]).expand([N, N]).not_equal(
        paddle.t(targets.reshape([N, 1]).expand([N, N]))).astype('float')

    # Mask scores related to itself
    is_pos = is_pos - paddle.eye(N, N)

    s_p = dist_mat * is_pos
    s_n = dist_mat * is_neg

    logit_p = -self.gamma * s_p + (-99999999.) * (1 - is_pos)
    logit_n = self.gamma * (s_n + self.margin) + (-99999999.) * (1 - is_neg)

    loss = F.softplus(
        paddle.logsumexp(logit_p, axis=1) +
        paddle.logsumexp(logit_n, axis=1)).mean()

    return {"PairwiseCosface": loss}
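# Note (derivation, for reference; not part of the original code): since
# softplus(a) = log(1 + exp(a)) and masked entries contribute exp(-99999999.)
# which is effectively zero, the loss above equals
#   log(1 + sum over (p, n) pairs of exp(gamma * (s_n + margin - s_p))),
# i.e. a pairwise objective over every positive/negative cosine-similarity
# pair of each anchor in the batch.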
def matmul(name, x1, x2, x_transpose=False, y_transpose=False):
    import paddle as pdpd
    pdpd.enable_static()

    with pdpd.static.program_guard(pdpd.static.Program(),
                                   pdpd.static.Program()):
        node_x1 = pdpd.static.data(name='x1', shape=x1.shape, dtype=x1.dtype)
        node_x2 = pdpd.static.data(name='x2', shape=x2.shape, dtype=x2.dtype)
        result = pdpd.matmul(node_x1, node_x2, x_transpose, y_transpose)
        # result = pdpd.static.nn.batch_norm(mul_node, use_global_stats=True)

        cpu = pdpd.static.cpu_places(1)
        exe = pdpd.static.Executor(cpu[0])
        # startup program will call initializer to initialize the parameters.
        exe.run(pdpd.static.default_startup_program())

        outs = exe.run(feed={'x1': x1, 'x2': x2}, fetch_list=[result])

        saveModel(name, exe, feedkeys=['x1', 'x2'], fetchlist=[result],
                  inputs=[x1, x2], outputs=[outs[0]], target_dir=sys.argv[1])

    return outs[0]
def forward(self,
            query_input_ids,
            pos_title_input_ids,
            neg_title_input_ids,
            is_prediction=False,
            query_token_type_ids=None,
            query_position_ids=None,
            query_attention_mask=None,
            pos_title_token_type_ids=None,
            pos_title_position_ids=None,
            pos_title_attention_mask=None,
            neg_title_token_type_ids=None,
            neg_title_position_ids=None,
            neg_title_attention_mask=None):
    query_cls_embedding = self.get_pooled_embedding(
        query_input_ids, query_token_type_ids, query_position_ids,
        query_attention_mask)

    pos_title_cls_embedding = self.get_pooled_embedding(
        pos_title_input_ids, pos_title_token_type_ids,
        pos_title_position_ids, pos_title_attention_mask)

    neg_title_cls_embedding = self.get_pooled_embedding(
        neg_title_input_ids, neg_title_token_type_ids,
        neg_title_position_ids, neg_title_attention_mask)

    all_title_cls_embedding = paddle.concat(
        x=[pos_title_cls_embedding, neg_title_cls_embedding], axis=0)

    if is_prediction:
        logits = paddle.dot(query_cls_embedding, pos_title_cls_embedding)
        outputs = {
            "probs": logits,
            "q_rep": query_cls_embedding,
            "p_rep": pos_title_cls_embedding
        }
        return outputs

    if self.use_cross_batch:
        tensor_list = []
        paddle.distributed.all_gather(tensor_list, all_title_cls_embedding)
        all_title_cls_embedding = paddle.concat(x=tensor_list, axis=0)

    # multiply
    logits = paddle.matmul(
        query_cls_embedding, all_title_cls_embedding, transpose_y=True)

    batch_size = query_cls_embedding.shape[0]

    labels = paddle.arange(
        batch_size * self.rank * 2,
        batch_size * (self.rank * 2 + 1),
        dtype='int64')
    labels = paddle.reshape(labels, shape=[-1, 1])

    accuracy = paddle.metric.accuracy(input=logits, label=labels)
    loss = F.cross_entropy(input=logits, label=labels)
    outputs = {"loss": loss, "accuracy": accuracy}

    return outputs
def forward(self, x):
    x_shape = paddle.shape(x)
    x = x.flatten(2)
    mu = paddle.tile(self.mu, [x_shape[0], 1, 1])

    with paddle.no_grad():
        for i in range(self.stage_num):
            x_t = paddle.transpose(x, [0, 2, 1])
            z = paddle.bmm(x_t, mu)
            z = F.softmax(z, axis=2)
            z_ = F.normalize(z, axis=1, p=1)
            mu = paddle.bmm(x, z_)
            mu = F.normalize(mu, axis=1, p=2)

    z_t = paddle.transpose(z, [0, 2, 1])
    x = paddle.matmul(mu, z_t)
    x = paddle.reshape(x, [0, self.c, x_shape[2], x_shape[3]])

    if self.training:
        mu = paddle.mean(mu, 0, keepdim=True)
        mu = F.normalize(mu, axis=1, p=2)
        mu = self.mu * (1 - self.momentum) + mu * self.momentum
        if paddle.distributed.get_world_size() > 1:
            mu = paddle.distributed.all_reduce(mu)
            mu /= paddle.distributed.get_world_size()
        self.mu = mu

    return x
def einsum4x4(equation, x, y):
    """
    Only works for 4D x 4D.
    """
    idx_x, idx_y, idx_z = re.split(",|->", equation)
    # Compute repeated index
    repeated_idx = list(set(idx_x + idx_y) - set(idx_z))

    unique_idx_x = list(set(idx_x) - set(idx_y))
    unique_idx_y = list(set(idx_y) - set(idx_x))
    common_idx = list(set(idx_x) & set(idx_y) - set(repeated_idx))

    new_idx_x = common_idx + unique_idx_x + repeated_idx
    new_idx_y = common_idx + unique_idx_y + repeated_idx
    new_idx_z = common_idx + unique_idx_x + unique_idx_y

    perm_x = [idx_x.index(i) for i in new_idx_x]
    perm_y = [idx_y.index(i) for i in new_idx_y]
    perm_z = [new_idx_z.index(i) for i in idx_z]

    x = paddle.transpose(x, perm=perm_x)
    y = paddle.transpose(y, perm=perm_y)
    z = paddle.matmul(x=x, y=y, transpose_y=True)
    z = paddle.transpose(z, perm=perm_z)
    return z
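# Hypothetical sanity check (illustrative shapes only; not part of the original
# code): for an attention-style contraction, the decomposition above reduces to
# a transpose plus a batched matmul, so the result should match
# paddle.matmul with transpose_y=True:
#   q = paddle.randn([2, 4, 5, 8])                 # [B, H, T, D]
#   k = paddle.randn([2, 4, 6, 8])                 # [B, H, S, D]
#   scores = einsum4x4('bhtd,bhsd->bhts', q, k)    # [B, H, T, S]
#   # expected to equal paddle.matmul(q, k, transpose_y=True)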
def forward(self, x):
    # NOTE: manually trigger `__iter__` logic.
    params = list(self.params.__iter__())

    out = paddle.matmul(x, params[0])
    out = paddle.add(out, params[1])
    out = paddle.tanh(out)
    return out
def forward(self, input, label, init_hidden, init_cell):
    init_h = paddle.reshape(
        init_hidden, shape=[self.num_layers, -1, self.hidden_size])
    init_c = paddle.reshape(
        init_cell, shape=[self.num_layers, -1, self.hidden_size])
    x_emb = self.embedding(input)
    x_emb = paddle.reshape(
        x_emb, shape=[-1, self.num_steps, self.hidden_size])
    if self.dropout is not None and self.dropout > 0.0:
        x_emb = paddle.nn.functional.dropout(
            x_emb, p=self.dropout, mode='upscale_in_train')
    rnn_out, last_hidden, last_cell = self.simple_lstm_rnn(
        x_emb, init_h, init_c)
    projection = paddle.matmul(x=rnn_out, y=self.softmax_weight)
    projection = paddle.add(x=projection, y=self.softmax_bias)
    loss = paddle.nn.functional.softmax_with_cross_entropy(
        logits=projection, label=label, soft_label=False)
    loss = paddle.reshape(loss, shape=[-1, self.num_steps])
    loss = paddle.mean(loss, axis=[0])
    loss = paddle.sum(loss)
    return loss, last_hidden, last_cell
def gen_bias(encoder_inputs, decoder_inputs, step):
    decoder_bsz, decoder_seqlen = decoder_inputs.shape[:2]
    encoder_bsz, encoder_seqlen = encoder_inputs.shape[:2]
    attn_bias = paddle.reshape(
        paddle.arange(0, decoder_seqlen, 1, dtype='float32') + 1, [1, -1, 1])
    decoder_bias = paddle.cast(
        (paddle.matmul(attn_bias, 1. / attn_bias, transpose_y=True) >= 1.),
        'float32')  # [1, decoderlen, decoderlen]
    encoder_bias = paddle.unsqueeze(
        paddle.cast(paddle.ones_like(encoder_inputs), 'float32'),
        [1])  # [bsz, 1, encoderlen]
    encoder_bias = paddle.expand(
        encoder_bias, [encoder_bsz, decoder_seqlen,
                       encoder_seqlen])  # [bsz, decoderlen, encoderlen]
    decoder_bias = paddle.expand(
        decoder_bias, [decoder_bsz, decoder_seqlen,
                       decoder_seqlen])  # [bsz, decoderlen, decoderlen]
    if step > 0:
        bias = paddle.concat([
            encoder_bias,
            paddle.ones([decoder_bsz, decoder_seqlen, step], 'float32'),
            decoder_bias
        ], -1)
    else:
        bias = paddle.concat([encoder_bias, decoder_bias], -1)
    return bias
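# Hypothetical usage sketch (illustrative shapes only; not part of the original
# code): at step 0 the returned bias has shape
# [bsz, decoder_seqlen, encoder_seqlen + decoder_seqlen]; the encoder columns
# are all ones (full visibility) and the decoder columns form a lower-triangular
# causal mask, so decoder position i attends to every encoder token and to
# decoder positions <= i.
#   enc_ids = paddle.zeros([2, 5], dtype='int64')
#   dec_ids = paddle.zeros([2, 3], dtype='int64')
#   bias = gen_bias(enc_ids, dec_ids, step=0)      # shape [2, 3, 8]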