def construct(self, s_t_hat, encoder_outputs, encoder_feature, enc_padding_mask, coverage):
    b, t_k, n = encoder_outputs.shape

    dec_fea = self.decode_proj(s_t_hat)  # (B, 2 * hidden_dim)
    dec_fea_expand = P.ExpandDims()(dec_fea, 1)  # (B, 1, 2 * hidden_dim)
    # BroadcastTo takes the target shape in its constructor
    dec_fea_expand = P.BroadcastTo((b, t_k, n))(dec_fea_expand)

    att_features = encoder_feature + dec_fea_expand  # (B, t_k, 2 * hidden_dim)
    if self.is_coverage:
        coverage_input = coverage.view(-1, 1)  # (B * t_k, 1)
        coverage_feature = self.W_c(coverage_input)  # (B * t_k, 2 * hidden_dim)
        # reshape so it can be added to the 3-D attention features
        coverage_feature = coverage_feature.view(b, t_k, n)
        att_features = att_features + coverage_feature

    e = P.Tanh()(att_features)  # (B, t_k, 2 * hidden_dim)
    scores = self.v(e)  # (B, t_k, 1)
    scores = scores.view(-1, t_k)  # (B, t_k)

    # mask padding positions, then renormalize the attention distribution
    attn_dist_ = P.Softmax(1)(scores) * enc_padding_mask  # (B, t_k)
    normalization_factor = P.ReduceSum(True)(attn_dist_, 1)
    attn_dist = attn_dist_ / normalization_factor

    attn_dist = P.ExpandDims()(attn_dist, 1)  # (B, 1, t_k)
    c_t = P.BatchMatMul()(attn_dist, encoder_outputs)  # (B, 1, n)
    c_t = c_t.view(-1, self.hidden_dim * 2)  # (B, 2 * hidden_dim)

    attn_dist = attn_dist.view(-1, t_k)  # (B, t_k)

    if self.is_coverage:
        coverage = coverage.view(-1, t_k)
        coverage = coverage + attn_dist

    return c_t, attn_dist, coverage
def construct(self, query, key, value):
    # additive (Bahdanau-style) attention: score = v^T tanh(W_k key + W_q query + b)
    score = self.score_proj(
        ops.tanh(self.key_proj(key) + self.query_proj(query) + self.bias)).squeeze(-1)
    attn = ops.Softmax()(score)
    context = ops.matmul(attn.expand_dims(1), value)
    return context, attn
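# Shape note for the additive attention above (assumed shapes, not confirmed by
# the source): with query (B, 1, H) and key/value (B, T, H), score is (B, T)
# after squeeze(-1), attn is a (B, T) distribution over source positions, and
# context is (B, 1, H) from the batched matmul of attn.expand_dims(1) with value.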
class Scalar_mix(nn.Cell):
    """
    Computes a parametrised scalar mixture of N tensors,
    ``mixture = gamma * sum(s_k * tensor_k)``
    where ``s = softmax(w)``, with ``w`` and ``gamma`` scalar parameters.
    """
    def __init__(self, mixture_size: int, do_layer_norm: bool = False) -> None:
        super(Scalar_mix, self).__init__()
        self.mixture_size = mixture_size
        self.do_layer_norm = do_layer_norm

        self.scalar_parameters = ParameterTuple(
            [Parameter(Tensor(np.array([0.0]), mindspore.float32))
             for _ in range(mixture_size)])
        self.gamma = Parameter(Tensor(np.array([0.0]), mindspore.float32))
        self.sum = P.ReduceSum()
        self.sqrt = P.Sqrt()
        self.cat = P.Concat()
        self.unsqueeze = P.ExpandDims()

    def construct(self, tensors, mask):
        """
        Compute a weighted average of the ``tensors``.

        Args:
            tensors: The input tensors can be any shape with at least two
                dimensions, but must all be the same shape.
            mask: Required when ``do_layer_norm=True``. For example, with
                ``tensors`` of shape ``(batch_size, timesteps, dim)``, ``mask``
                has shape ``(batch_size, timesteps)`` and dtype ``mindspore.float32``.
        """
        if len(tensors) != self.mixture_size:
            raise ValueError("{} tensors were passed, but the module was initialized to "
                             "mix {} tensors.".format(len(tensors), self.mixture_size))

        def _do_layer_norm(tensor, broadcast_mask, num_elements_not_masked):
            tensor_masked = tensor * broadcast_mask
            mean = self.sum(tensor_masked) / num_elements_not_masked
            variance = self.sum(((tensor_masked - mean) * broadcast_mask) ** 2) / num_elements_not_masked
            return (tensor - mean) / self.sqrt(variance + 1E-12)

        normed_weights = P.Softmax(axis=0)(self.cat([parameter for parameter
                                                     in self.scalar_parameters]))
        # split into per-layer scalar weights
        # (to verify: should match torch.split with split_size=1)
        normed_weights = P.Split(0, normed_weights.shape[0])(normed_weights)

        if not self.do_layer_norm:
            pieces = []
            for weight, tensor in zip(normed_weights, tensors):
                pieces.append(weight * tensor)
            return self.gamma * sum(pieces)

        # broadcast the mask over the feature dimension before layer norm
        broadcast_mask = self.unsqueeze(mask, -1)
        input_dim = tensors[0].shape[-1]
        num_elements_not_masked = self.sum(mask) * input_dim

        pieces = []
        for weight, tensor in zip(normed_weights, tensors):
            pieces.append(weight * _do_layer_norm(tensor, broadcast_mask,
                                                  num_elements_not_masked))
        return self.gamma * sum(pieces)
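# Minimal usage sketch for Scalar_mix (illustrative values; shapes assumed
# (batch_size, timesteps, dim) as in the docstring above):
layers = [Tensor(np.random.randn(2, 5, 8), mindspore.float32) for _ in range(3)]
mask = Tensor(np.ones((2, 5)), mindspore.float32)
mix = Scalar_mix(mixture_size=3, do_layer_norm=True)
mixed = mix(layers, mask)  # (2, 5, 8): gamma * sum_k softmax(w)_k * layer_norm(layer_k)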
def rerank(args):
    """rerank function"""
    rerank_feature_file = args.rerank_feature_file
    rerank_result_file = args.rerank_result_file
    encoder_ck_file = args.rerank_encoder_ck_path
    downstream_ck_file = args.rerank_downstream_ck_path
    seed = args.seed
    seq_len = args.seq_len
    batch_size = args.rerank_batch_size

    random.seed(seed)
    np.random.seed(seed)

    t1 = time()

    generator = DataGenerator(feature_file_path=rerank_feature_file,
                              example_file_path=None,
                              batch_size=batch_size,
                              seq_len=seq_len,
                              task_type="reranker")
    gather_dict = defaultdict(lambda: defaultdict(list))

    reranker = Reranker(batch_size=batch_size,
                        encoder_ck_file=encoder_ck_file,
                        downstream_ck_file=downstream_ck_file)

    print("start re-ranking ...")

    for _, batch in tqdm(enumerate(generator)):
        input_ids = Tensor(batch["context_idxs"], mstype.int32)
        attn_mask = Tensor(batch["context_mask"], mstype.int32)
        token_type_ids = Tensor(batch["segment_idxs"], mstype.int32)

        no_answer = reranker(input_ids, attn_mask, token_type_ids)

        no_answer_prob = ops.Softmax()(no_answer).asnumpy()
        no_answer_prob = no_answer_prob[:, 0]

        for i in range(len(batch['ids'])):
            qas_id = batch['ids'][i]
            gather_dict[qas_id][no_answer_prob[i]].append(batch['unique_ids'][i])
            gather_dict[qas_id][no_answer_prob[i]].append(batch['path'][i])

    rerank_result = {}

    for qas_id in tqdm(gather_dict, desc="get top1 path from re-rank result"):
        all_paths = gather_dict[qas_id]
        # sort by no-answer probability ascending; the most answerable path wins
        all_paths = sorted(all_paths.items(), key=lambda item: item[0])
        assert qas_id not in rerank_result
        rerank_result[qas_id] = all_paths[0][1]

    with open(rerank_result_file, 'w') as f:
        json.dump(rerank_result, f)

    t2 = time()

    print(f"re-rank cost time: {t2-t1} s")
def __init__(self, temperature=1, hard=False, axis=-1):
    super().__init__()
    self.temperature = temperature
    self.hard = hard
    self.axis = axis
    self.uniform = ops.UniformReal()
    self.softmax = ops.Softmax(axis)
    self.on_value = Tensor(1.0, mindspore.float32)
    self.off_value = Tensor(0.0, mindspore.float32)
def gumbel_softmax(logits, temperature, hard, axis=-1, eps=1e-20):
    uniform_samples = ops.UniformReal()(logits.shape)
    gumbels = -ops.log(-ops.log(uniform_samples + eps) + eps)  # ~Gumbel(0, 1)
    gumbels = (logits + gumbels) / temperature
    y_soft = ops.Softmax(axis)(gumbels)

    if hard:
        # Straight-through estimator: one-hot in the forward pass,
        # soft-sample gradients in the backward pass.
        index = y_soft.argmax(axis)
        y_hard = ops.OneHot(axis)(index, y_soft.shape[axis],
                                  ops.scalar_to_array(1.0), ops.scalar_to_array(0.0))
        ret = ops.stop_gradient(y_hard - y_soft) + y_soft
    else:
        # Reparametrization trick.
        ret = y_soft
    return ret
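# Minimal usage sketch for gumbel_softmax (illustrative logits; assumes
# `import mindspore`, `import numpy as np`, `from mindspore import Tensor`):
logits = Tensor(np.array([[1.0, 2.0, 0.5]]), mindspore.float32)
y_hard = gumbel_softmax(logits, temperature=0.5, hard=True)   # one-hot forward, soft gradient
y_soft = gumbel_softmax(logits, temperature=0.5, hard=False)  # relaxed sample on the simplex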
def __init__(self, config, is_training, num_tokens, dropout_prob=0.0, use_one_hot_embeddings=False):
    super(BertPoetryModel, self).__init__()
    self.bert = BertModel(config, is_training, use_one_hot_embeddings)
    self.num_tokens = num_tokens
    idx = np.arange(config.seq_length)
    mask = idx[None, :] <= idx[:, None]
    self.mask = Tensor([mask], mstype.float32)
    self.MLM_Dense = nn.Dense(config.hidden_size, config.hidden_size,
                              has_bias=True, weight_init=TruncatedNormal(0.02),
                              activation='gelu').to_float(mstype.float16)
    self.layer_norm = nn.LayerNorm((config.hidden_size,))
    self.matmul = ops.MatMul(transpose_b=True)
    self.biasadd = Parameter(initializer('zero', self.num_tokens), name='MLM_output_biasadd')
    self.softmax = ops.Softmax(axis=-1)
    self.seq_length = config.seq_length
    self.hidden_size = config.hidden_size
    self.cast = ops.Cast()
    self.reshape = ops.Reshape()
    self.batch_matmul = ops.BatchMatMul()
    # lower-triangular mask enforces left-to-right (causal) attention for generation
    ones = np.ones(shape=(config.batch_size, config.seq_length, config.seq_length))
    self.lower_triangle_mask = Tensor(np.tril(ones), dtype=mstype.float32)
    self.multiply = ops.Mul()
def construct(
        self,
        query: ms.Tensor,
        key: ms.Tensor,
        value: ms.Tensor,
        attn_mask: Optional[ms.Tensor] = None,
) -> Tuple[ms.Tensor, ms.Tensor]:
    r"""
    Args:
        query: [batch, num_attention_heads, len_query, dim_query]
        key: [batch, num_attention_heads, len_key, dim_key]
        value: [batch, num_attention_heads, len_value, dim_value]
        attn_mask: [batch, num_attention_heads, len_query, len_key]
    """
    # scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V
    attention = ops.matmul(query, key.transpose(0, 1, 3, 2))
    attention = attention / ops.sqrt(generate_factor(query.shape[-1]))
    if attn_mask is not None:
        attention = attention + attn_mask
    attention = ops.Softmax(axis=-1)(attention)
    attention = self.dropout(attention)
    context = ops.matmul(attention, value)
    return context, attention
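# Hedged functional sketch of the same computation (dropout omitted; the
# sqrt(d_k) scaling written out as a literal instead of generate_factor, and
# `import numpy as np` assumed):
q = Tensor(np.random.randn(2, 4, 10, 16), ms.float32)
k = Tensor(np.random.randn(2, 4, 10, 16), ms.float32)
v = Tensor(np.random.randn(2, 4, 10, 16), ms.float32)
scores = ops.matmul(q, k.transpose(0, 1, 3, 2)) / 4.0  # sqrt(d_k), d_k = 16
weights = ops.Softmax(axis=-1)(scores)                 # (2, 4, 10, 10), rows sum to 1
context = ops.matmul(weights, v)                       # (2, 4, 10, 16)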
import mindspore as ms
import mindspore.nn as nn
import numpy as np
import mindspore.common.initializer as weight_init
import mindspore.ops as P
from mindspore import Tensor
from mindspore.common.initializer import Normal, Constant

# net = nn.MatMul()
# input_x1 = Tensor(np.ones(shape=[3, 2, 3]), ms.float32)
# input_x2 = Tensor(np.ones(shape=[2, 3, 4]), ms.float32)
# output = net(input_x1, input_x2)
# print(output.shape)
# ------------------------------------------------------------

gate = ms.Parameter(ms.Tensor(np.ones(3), dtype=ms.float64), name="w", requires_grad=True)
gate.set_data(weight_init.initializer(Constant(1 / 3), gate.shape, gate.dtype))
print(gate.dtype)
print("gate is ", gate)

softmax = P.Softmax()
gate_ = softmax(gate)
print(gate_)
def read(args):
    """reader function"""
    db_file = args.wiki_db_file
    reader_feature_file = args.reader_feature_file
    reader_example_file = args.reader_example_file
    encoder_ck_file = args.reader_encoder_ck_file
    downstream_ck_file = args.reader_downstream_ck_file
    albert_model_path = args.albert_model_path
    reader_result_file = args.reader_result_file
    seed = args.seed
    sp_threshold = args.sp_threshold
    seq_len = args.seq_len
    batch_size = args.reader_batch_size
    para_limit = args.max_para_num
    sent_limit = args.max_sent_num

    random.seed(seed)
    np.random.seed(seed)

    t1 = time()

    doc_db = DocDB(db_file)

    generator = DataGenerator(feature_file_path=reader_feature_file,
                              example_file_path=reader_example_file,
                              batch_size=batch_size,
                              seq_len=seq_len,
                              para_limit=para_limit,
                              sent_limit=sent_limit,
                              task_type="reader")
    example_dict = generator.example_dict
    feature_dict = generator.feature_dict
    answer_dict = defaultdict(lambda: defaultdict(list))
    new_answer_dict = {}
    total_sp_dict = defaultdict(list)
    new_total_sp_dict = defaultdict(list)

    tokenizer = AlbertTokenizer.from_pretrained(albert_model_path)
    new_tokens = ['[q]', '[/q]', '<t>', '</t>', '[s]']
    tokenizer.add_tokens(new_tokens)

    reader = Reader(batch_size=batch_size,
                    encoder_ck_file=encoder_ck_file,
                    downstream_ck_file=downstream_ck_file)

    print("start reading ...")

    for _, batch in tqdm(enumerate(generator)):
        input_ids = Tensor(batch["context_idxs"], mstype.int32)
        attn_mask = Tensor(batch["context_mask"], mstype.int32)
        token_type_ids = Tensor(batch["segment_idxs"], mstype.int32)
        context_mask = Tensor(batch["context_mask"], mstype.float32)
        square_mask = Tensor(batch["square_mask"], mstype.float32)
        packing_mask = Tensor(batch["query_mapping"], mstype.float32)
        para_start_mapping = Tensor(batch["para_start_mapping"], mstype.float32)
        sent_end_mapping = Tensor(batch["sent_end_mapping"], mstype.float32)
        unique_ids = batch["unique_ids"]
        sent_names = batch["sent_names"]
        cache_mask = Tensor(np.tril(np.triu(np.ones((seq_len, seq_len)), 0), 30),
                            mstype.float32)

        _, _, q_type, _, sent_logit, y1, y2 = reader(input_ids, attn_mask,
                                                     token_type_ids, context_mask,
                                                     square_mask, packing_mask,
                                                     cache_mask, para_start_mapping,
                                                     sent_end_mapping)

        type_prob = ops.Softmax()(q_type).asnumpy()

        answer_dict_ = convert_to_tokens(example_dict,
                                         feature_dict,
                                         batch['ids'],
                                         y1.asnumpy().tolist(),
                                         y2.asnumpy().tolist(),
                                         type_prob,
                                         tokenizer,
                                         sent_logit.asnumpy(),
                                         sent_names,
                                         unique_ids)
        for q_id in answer_dict_:
            answer_dict[q_id] = answer_dict_[q_id]

    for q_id in answer_dict:
        res = answer_dict[q_id]
        answer_text_ = res[0]
        sent_ = res[1]
        sent_names_ = res[2]
        new_answer_dict[q_id] = answer_text_

        predict_support_np = ops.Sigmoid()(Tensor(sent_, mstype.float32)).asnumpy()

        for j in range(predict_support_np.shape[0]):
            if j >= len(sent_names_):
                break
            if predict_support_np[j] > sp_threshold:
                total_sp_dict[q_id].append(sent_names_[j])

    for _id in total_sp_dict:
        _sent_names = total_sp_dict[_id]
        for para in _sent_names:
            title = make_wiki_id(para[0], 0)
            para_original_title = doc_db.get_doc_info(title)[-1]
            para[0] = para_original_title
            new_total_sp_dict[_id].append(para)

    prediction = {'answer': new_answer_dict,
                  'sp': new_total_sp_dict}

    with open(reader_result_file, 'w') as f:
        json.dump(prediction, f, indent=4)

    t2 = time()

    print(f"reader cost time: {t2-t1} s")
def log_softmax(input, axis=-1):
    return ops.log(ops.Softmax(axis)(input))


def softmax(input, axis=-1):
    return ops.Softmax(axis)(input)
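# Quick sanity sketch for the two functional wrappers above (illustrative
# values; assumes `import mindspore`, `import numpy as np`,
# `from mindspore import Tensor`):
x = Tensor(np.array([[1.0, 2.0, 3.0]]), mindspore.float32)
p = softmax(x)        # each row sums to 1
lp = log_softmax(x)   # equals ops.log(p) up to numerical error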