def record(self, loss):
    """Fold one loss value into the running totals kept on ``self``.

    ``self.numel`` grows by the number of elements ``loss`` contributes
    (1 for Python scalars) and ``self.loss_sum`` grows by their sum.
    ``self.loss_sum`` is created on the first call, while it is still
    ``None``.

    Args:
        loss: A ``flow.Tensor``, ``np.ndarray``, ``float`` or ``int``.

    Raises:
        TypeError: If ``loss`` has any other type.
    """
    if isinstance(loss, np.ndarray):
        self.numel += loss.size
        loss = loss.sum()
        if self.loss_sum is None:
            # The reduced value is a NumPy scalar, so the accumulator has to
            # be created with NumPy; flow.zeros_like expects a tensor.
            self.loss_sum = np.zeros_like(loss)
        self.loss_sum += loss
    elif isinstance(loss, float):
        self.numel += 1
        if self.loss_sum is None:
            self.loss_sum = 0.0
        self.loss_sum += loss
    elif isinstance(loss, int):
        self.numel += 1
        if self.loss_sum is None:
            self.loss_sum = 0
        self.loss_sum += loss
    elif isinstance(loss, flow.Tensor):
        self.numel += loss.shape.numel()
        loss = loss.sum()
        if self.loss_sum is None:
            self.loss_sum = flow.zeros_like(loss)
        self.loss_sum += loss
    else:
        raise TypeError(f"invalid loss type: {type(loss)}")
def step(self, closure: Callable = None):
    """Apply one Adam update to every parameter that has a gradient.

    Args:
        closure (callable, optional): Re-evaluates the model and returns the
            loss. Its result is handed back to the caller.

    Returns:
        Whatever ``closure`` returned, or ``None`` when no closure was given.
    """
    state_keys = ("exp_avg", "exp_avg_sq", "max_exp_avg_sq")
    with flow.no_grad():
        loss = closure() if closure is not None else None

        for group in self.param_groups:
            if group["do_bias_correction"]:
                # Bias-correction terms are refreshed for the step about to run.
                upcoming = self._state["step"] + 1
                group["bias_correction1"] = 1.0 - math.pow(group["betas"][0], upcoming)
                upcoming = self._state["step"] + 1
                group["bias_correction2"] = 1.0 - math.pow(group["betas"][1], upcoming)

            hyper = dict(
                learning_rate=group["lr"],
                bias_correction1=group["bias_correction1"],
                bias_correction2=group["bias_correction2"],
                l2=group["weight_decay"],
                beta1=group["betas"][0],
                beta2=group["betas"][1],
                epsilon=group["eps"],
                do_bias_correction=group["do_bias_correction"],
                amsgrad=group["amsgrad"],
            )

            for param in group.parameters:
                if param.grad is None:
                    continue
                # Lazily allocate the per-parameter moment buffers.
                for key in state_keys:
                    if key not in self._state[param]:
                        self._state[param][key] = flow.zeros_like(param)
                buffers = tuple(self._state[param][key] for key in state_keys)
                flow._C.dispatch_adam_update(
                    self._op, (param, param.grad) + buffers, **hyper
                )

        self._state["step"] += 1
        return loss
def _test_send_recv_without_sending_meta(test_case, x0, src, dst):
    """Check send/recv between two ranks when tensor meta is not transmitted.

    The ``src`` rank sends ``x0`` twice with ``send_meta=False``. The ``dst``
    rank receives it once as a fresh tensor and once into a preallocated
    buffer, passing shape, dtype and device explicitly. Other ranks do nothing.
    """
    current_rank = flow.env.get_rank()

    if current_rank == src:
        for _ in range(2):
            flow.comm.send(x0, dst, send_meta=False)
        return

    if current_rank != dst:
        # do nothing
        return

    received = flow.comm.recv(
        src, shape=x0.shape, dtype=x0.dtype, device=x0.device
    )
    test_case.assertTrue(np.array_equal(received.numpy(), x0.numpy()))

    buffer = flow.zeros_like(x0)
    flow.comm.recv(
        src, shape=x0.shape, dtype=x0.dtype, device=x0.device, out=buffer
    )
    test_case.assertTrue(np.array_equal(buffer.numpy(), x0.numpy()))
def step(self, closure: Callable = None):
    """Apply one SGD (optionally momentum) update to all parameters with gradients.

    Args:
        closure (callable, optional): Called once before the update; its
            return value is passed back to the caller.

    Returns:
        The closure's result, or ``None`` when no closure was supplied.
    """
    with flow.no_grad():
        loss = None if closure is None else closure()

        for group in self.param_groups:
            lr = group["lr"]
            l2 = group["weight_decay"]
            for param in group.parameters:
                if param.grad is None:
                    continue

                beta = group["momentum"]
                if beta == 0.0:
                    # Plain SGD: no per-parameter state is needed.
                    flow._C.dispatch_sgd_update(
                        self._sgd, (param, param.grad), learning_rate=lr, l2=l2
                    )
                    continue

                if "momentum_buf" not in self._state[param]:
                    self._state[param]["momentum_buf"] = flow.zeros_like(param)
                velocity = self._state[param]["momentum_buf"]
                beta = group["momentum"]
                flow._C.dispatch_momentum_update(
                    self._momentum_sgd,
                    (param, param.grad, velocity),
                    learning_rate=lr,
                    l2=l2,
                    beta=beta,
                )

        completed = self._state["step"]
        self._state["step"] = completed + 1
        return loss
def test_discriminator(
    z: oft.Numpy.Placeholder((self.batch_size, 100)),
    images: oft.Numpy.Placeholder((self.batch_size, 1, 28, 28)),
    label1: oft.Numpy.Placeholder((self.batch_size, 1)),
    label0: oft.Numpy.Placeholder((self.batch_size, 1)),
):
    """Build the discriminator loss and attach an SGD optimizer to it.

    The generator output for ``z`` is scored against all-zero targets and
    ``images`` against all-one targets; the two sigmoid cross-entropy losses
    are summed, minimized with momentum-free SGD and returned.
    ``label1`` and ``label0`` are accepted but not read here.
    """
    fake_images = self.generator(z, trainable=False, const_init=True)
    fake_logits = self.discriminator(fake_images, trainable=True, const_init=True)
    fake_targets = flow.zeros_like(fake_logits)
    loss_on_fake = flow.nn.sigmoid_cross_entropy_with_logits(
        fake_targets,
        fake_logits,
        name="Dloss_fake_sigmoid_cross_entropy_with_logits",
    )

    # The second discriminator call shares variables with the first one.
    real_logits = self.discriminator(
        images, trainable=True, reuse=True, const_init=True
    )
    real_targets = flow.ones_like(real_logits)
    loss_on_real = flow.nn.sigmoid_cross_entropy_with_logits(
        real_targets,
        real_logits,
        name="Dloss_real_sigmoid_cross_entropy_with_logits",
    )

    total_loss = loss_on_fake + loss_on_real
    optimizer = flow.optimizer.SGD(
        flow.optimizer.PiecewiseConstantScheduler([], [self.lr]), momentum=0
    )
    optimizer.minimize(total_loss)
    return total_loss
def training_step(self, batch, optimizer_idx): if optimizer_idx == 0: # generator (z,) = batch g_out = self._generator(z, trainable=True, const_init=True) g_logits = self._discriminator(g_out, trainable=False, const_init=True) g_loss = flow.nn.sigmoid_cross_entropy_with_logits( flow.ones_like(g_logits), g_logits, name="Gloss_sigmoid_cross_entropy_with_logits", ) return (g_loss, g_out) elif optimizer_idx == 1: # discriminator z, images = batch g_out = self._generator(z, trainable=False, const_init=True) g_logits = self._discriminator(g_out, trainable=True, const_init=True) d_loss_fake = flow.nn.sigmoid_cross_entropy_with_logits( flow.zeros_like(g_logits), g_logits, name="Dloss_fake_sigmoid_cross_entropy_with_logits", ) d_logits = self._discriminator( images, trainable=True, reuse=True, const_init=True ) d_loss_real = flow.nn.sigmoid_cross_entropy_with_logits( flow.ones_like(d_logits), d_logits, name="Dloss_real_sigmoid_cross_entropy_with_logits", ) d_loss = d_loss_fake + d_loss_real return d_loss
def train_discriminator(self, images, label1, label0):
    """Run one discriminator optimisation step and return its diagnostics.

    Real ``images`` and generated images are scored in a single batched
    discriminator call; the real part is compared with ``label1`` and the
    generated part with ``label0``.

    Returns:
        A 5-tuple of ``to_numpy`` results: total loss, fake loss, real loss,
        real logits, fake logits.
    """
    # The sampled noise is immediately replaced by zeros of the same shape.
    noise = flow.zeros_like(self.generate_noise())
    fake_images = self.generator(noise)

    num_real = images.shape
    scores = self.discriminator(flow.cat((images, fake_images), dim=0))
    real_logits = scores[: images.shape[0]]
    fake_logits = scores[images.shape[0] :]

    loss_real = self.of_cross_entropy(real_logits, label1)
    loss_fake = self.of_cross_entropy(fake_logits, label0)
    loss_total = loss_fake + loss_real

    loss_total.backward()
    self.optimizerD.step()
    self.optimizerD.zero_grad()

    reported = (loss_total, loss_fake, loss_real, real_logits, fake_logits)
    return tuple(to_numpy(item) for item in reported)
def test_discriminator(
    z=flow.FixedTensorDef((self.batch_size, 100)),
    images=flow.FixedTensorDef((self.batch_size, 1, 28, 28)),
    label1=flow.FixedTensorDef((self.batch_size, 1)),
    label0=flow.FixedTensorDef((self.batch_size, 1)),
):
    """Build the discriminator loss and register it via ``flow.losses.add_loss``.

    Generated images are scored against all-zero targets and ``images``
    against all-one targets; the summed sigmoid cross-entropy is returned.
    ``label1`` and ``label0`` are accepted but not read here.
    """
    fake_images = self.generator(z, trainable=False, const_init=True)
    fake_logits = self.discriminator(fake_images, trainable=True, const_init=True)
    fake_targets = flow.zeros_like(fake_logits)
    loss_on_fake = flow.nn.sigmoid_cross_entropy_with_logits(
        fake_targets,
        fake_logits,
        name="Dloss_fake_sigmoid_cross_entropy_with_logits",
    )

    # The second discriminator call shares variables with the first one.
    real_logits = self.discriminator(
        images, trainable=True, reuse=True, const_init=True
    )
    real_targets = flow.ones_like(real_logits)
    loss_on_real = flow.nn.sigmoid_cross_entropy_with_logits(
        real_targets,
        real_logits,
        name="Dloss_real_sigmoid_cross_entropy_with_logits",
    )

    total_loss = loss_on_fake + loss_on_real
    flow.losses.add_loss(total_loss)
    return total_loss
def reduce_variance(
    input_tensor: remote_blob_util.BlobDef,
    axis: Optional[Union[int, Sequence[int]]] = None,
    keepdims: bool = False,
    name: Optional[str] = None,
) -> remote_blob_util.BlobDef:
    """Compute the variance of ``input_tensor`` along ``axis``.

    The result is built as ``mean(x ** 2) - mean(x) ** 2``. When the checked
    axis list is empty, a zero blob shaped like the input is returned instead.

    Args:
        input_tensor: The blob to reduce.
        axis: Axis or axes to reduce over. Defaults to None.
        keepdims: Whether reduced dimensions are kept. Defaults to False.
        name: Optional base name for the generated ops.

    Returns:
        The variance blob.
    """
    name = _gen_unique_name_if_need(name, "ReduceVariance_")
    axis = _check_axis(axis, input_tensor.shape)

    if isinstance(axis, list) and not axis:
        return flow.zeros_like(
            input_tensor, dtype=input_tensor.dtype, name=name + "_zeros_like"
        )

    squared_input = flow.math.square(input_tensor, name + "_square_minuend")
    mean_of_squares = flow.math.reduce_mean(
        squared_input, axis, keepdims, name + "_reduce_mean_minuend"
    )
    mean_value = flow.math.reduce_mean(
        input_tensor, axis, keepdims, name + "_reduce_mean_subtrahend"
    )
    square_of_mean = flow.math.square(mean_value, name + "_square_subtrahend")
    return flow.math.subtract(mean_of_squares, square_of_mean, name + "_subtract")
def reduce_std(
    input_tensor: oneflow._oneflow_internal.BlobDesc,
    axis: Optional[Union[int, Sequence[int]]] = None,
    keepdims: bool = False,
    name: Optional[str] = None,
) -> oneflow._oneflow_internal.BlobDesc:
    r"""Compute the standard deviation of the input Blob along the given axis.

    The equation is:

    .. math::
        out=\sqrt{\frac{1}{n}*\sum_{i=1}^{n}(x_i-mean)^2}

    Args:
        input_tensor (oneflow._oneflow_internal.BlobDesc): A Blob
        axis (Optional[Union[int, Sequence[int]]], optional): The dimension
            along which the standard deviation is computed. Defaults to None.
        keepdims (bool, optional): Whether to keep the reduced dimension in
            the output Blob. Defaults to False.
        name (Optional[str], optional): The name for the operation.
            Defaults to None.

    Returns:
        oneflow._oneflow_internal.BlobDesc: The standard deviation of the
        input Blob on the specified axis. A zero Blob shaped like the input
        is returned when the checked axis list is empty.

    For example:

    .. code-block:: python

        import oneflow as flow
        import numpy as np
        import oneflow.typing as tp


        @flow.global_function()
        def reduce_std_Job(x: tp.Numpy.Placeholder((3, 3))
        ) -> tp.Numpy:
            return flow.math.reduce_std(x, axis=1, keepdims=True)


        x = np.array([[0, 5, 10], [5, 5, 5], [12, 3, 0]]).astype(np.float32)
        out = reduce_std_Job(x)

        # out [[4.0824833]
        #      [0.       ]
        #      [5.0990195]]

    """
    name = _gen_unique_name_if_need(name, "ReduceStd_")
    axis = _check_axis(axis, input_tensor.shape)

    nothing_to_reduce = isinstance(axis, list) and not axis
    if nothing_to_reduce:
        return flow.zeros_like(
            input_tensor, dtype=input_tensor.dtype, name=name + "_zeros_like"
        )

    variance = flow.math.reduce_variance(
        input_tensor, axis, keepdims, name + "_reduce_variance"
    )
    return flow.math.sqrt(variance, name + "_sqrt")
def step(self, closure: Callable = None):
    """Apply one RMSprop update to every parameter that has a gradient.

    Args:
        closure (callable, optional): Re-evaluates the model and returns the
            loss. Its result is handed back to the caller.

    Returns:
        Whatever ``closure`` returned, or ``None`` when no closure was given.
    """
    with flow.no_grad():
        loss = None if closure is None else closure()

        for group in self.param_groups:
            hyper = dict(
                learning_rate=group["lr"],
                epsilon=group["eps"],
                decay_rate=group["alpha"],
                l2=group["weight_decay"],
            )
            for param in group.parameters:
                if param.grad is None:
                    continue

                if "square_avg" not in self._state[param]:
                    self._state[param]["square_avg"] = flow.zeros_like(param)
                square_avg = self._state[param]["square_avg"]

                if not group["centered"]:
                    flow._C.dispatch_rmsprop_update(
                        self._rmsprop, (param, param.grad, square_avg), **hyper
                    )
                    continue

                # The centered variant additionally tracks a running gradient mean.
                if "grad_avg" not in self._state[param]:
                    self._state[param]["grad_avg"] = flow.zeros_like(param)
                grad_avg = self._state[param]["grad_avg"]
                flow._C.dispatch_rmsprop_update(
                    self._centered_rmsprop,
                    (param, param.grad, square_avg, grad_avg),
                    centered=True,
                    **hyper,
                )

        completed = self._state["step"]
        self._state["step"] = completed + 1
        return loss
def train_generator(self, label1):
    """Run one generator optimisation step and return its diagnostics.

    Returns:
        A 3-tuple of ``to_numpy`` results: generator loss, generated output
        (converted with a second positional argument of ``False``) and the
        discriminator logits for that output.
    """
    # The sampled noise is immediately replaced by zeros of the same shape.
    noise = flow.zeros_like(self.generate_noise())

    fake_images = self.generator(noise)
    fake_logits = self.discriminator(fake_images)
    generator_loss = self.of_cross_entropy(fake_logits, label1)

    generator_loss.backward()
    self.optimizerG.step()
    self.optimizerG.zero_grad()

    loss_np = to_numpy(generator_loss)
    images_np = to_numpy(fake_images, False)
    logits_np = to_numpy(fake_logits)
    return (loss_np, images_np, logits_np)
def invert_permutation(permutation: Optional[Tensor]) -> Optional[Tensor]:
    """Return the inverse of ``permutation``, or ``None`` when given ``None``.

    The inverse is built by scattering the index range
    ``0 .. permutation.numel() - 1`` into a zero tensor at the positions named
    by ``permutation`` along dimension 0.
    """
    if permutation is not None:
        inverse = flow.zeros_like(permutation)
        positions = flow.arange(
            0, permutation.numel(), device=permutation.device, dtype=flow.int32
        )
        return flow.scatter(inverse, 0, permutation, positions)
    return None
def _zeros_by_val(val):
    """Return a zero of the same kind as ``val``.

    Tensors and NumPy arrays yield a zero-filled counterpart of matching
    shape; ``int`` yields ``0`` and ``float`` yields ``0.0``.

    Raises:
        ValueError: If ``val`` is none of the supported types.
    """
    if isinstance(val, flow.Tensor):
        return flow.zeros_like(val)
    if isinstance(val, np.ndarray):
        return np.zeros_like(val)
    if isinstance(val, int):
        return 0
    if isinstance(val, float):
        return 0.0
    raise ValueError
def step(self, closure: Callable = None):
    """Apply one LAMB update to every parameter that has a gradient.

    Args:
        closure (callable, optional): Re-evaluates the model and returns the
            loss. Its result is handed back to the caller.

    Returns:
        Whatever ``closure`` returned, or ``None`` when no closure was given.
    """
    with flow.no_grad():
        loss = None if closure is None else closure()

        for group in self.param_groups:
            # NOTE(review): this hyper-parameter dict is assembled but never
            # passed to self._lamb_op below - confirm whether the op picks
            # these values up some other way.
            kwargs = dict(
                learning_rate_val=group["lr"],
                bias_correction1_val=group["bias_correction1"],
                bias_correction2_val=group["bias_correction2"],
                decay_rate=group["alpha"],
                l2=group["weight_decay"],
                beta1=group["betas"][0],
                beta2=group["betas"][1],
                epsilon=group["eps"],
                do_bias_correction=group["do_bias_correction"],
            )
            for param in group.parameters:
                if param.grad is None:
                    continue
                # Lazily allocate the first and second moment buffers.
                for key in ("exp_avg", "exp_avg_sq"):
                    if key not in self._state[param]:
                        self._state[param][key] = flow.zeros_like(param)
                first_moment = self._state[param]["exp_avg"]
                second_moment = self._state[param]["exp_avg_sq"]
                self._lamb_op(param, param.grad, first_moment, second_moment)

        completed = self._state["step"]
        self._state["step"] = completed + 1
        return loss
def att_distill(args, student_atts, teacher_atts):
    """Sum the MSE between student attention maps and matching teacher maps.

    The teacher layers are split into as many equal blocks as there are
    student layers, and the last teacher layer of each block is paired with
    the corresponding student layer. Entries ``<= -1e2`` are replaced by zero
    in both maps before the loss is taken. ``args`` is not read.

    Returns:
        The accumulated loss over all layer pairs (``0.`` if there are none).
    """

    def _zero_masked(att):
        # Entries at or below -1e2 are zeroed before comparison.
        return flow.where(
            att <= flow.constant(-1e2, dtype=flow.float),
            flow.zeros_like(att),
            att,
        )

    att_loss = 0.
    num_teacher = len(teacher_atts)
    num_student = len(student_atts)
    assert num_teacher % num_student == 0

    block = int(num_teacher / num_student)
    picked_teacher_atts = []
    for idx in range(num_student):
        picked_teacher_atts.append(teacher_atts[idx * block + block - 1])

    for student_att, teacher_att in zip(student_atts, picked_teacher_atts):
        student_att = _zero_masked(student_att)
        teacher_att = _zero_masked(teacher_att)
        att_loss += mseloss(student_att, teacher_att)
    return att_loss
def get_target_tensor(self, prediction, target_is_real):
    """Create a label tensor with the same size as ``prediction``.

    Parameters:
        prediction (tensor) -- typically the prediction from a discriminator
        target_is_real (bool) -- whether the ground truth label is for real
            images or fake images

    Returns:
        A tensor shaped like ``prediction``, filled with ones when
        ``target_is_real`` is truthy and with zeros otherwise.
    """
    make_labels = flow.ones_like if target_is_real else flow.zeros_like
    return make_labels(prediction)
def _test_send_recv(test_case, x0, src, dst):
    """Check a send/recv round trip between two ranks.

    The ``src`` rank sends ``x0`` twice. The ``dst`` rank receives it once as
    a fresh tensor and once into a preallocated buffer, asserting both the
    values and the device each time. Other ranks do nothing.
    """
    current_rank = flow.env.get_rank()

    if current_rank == src:
        for _ in range(2):
            flow.comm.send(x0, dst)
        return

    if current_rank != dst:
        # do nothing
        return

    received = flow.comm.recv(src)
    test_case.assertTrue(np.array_equal(received.numpy(), x0.numpy()))
    test_case.assertEqual(received.device, x0.device)

    buffer = flow.zeros_like(x0)
    flow.comm.recv(src, out=buffer)
    test_case.assertTrue(np.array_equal(buffer.numpy(), x0.numpy()))
    test_case.assertEqual(buffer.device, x0.device)
def __init__(
    self,
    params: Union[Iterator[Parameter], List[Dict]],
    lr: float = 0.001,
    lr_decay: float = 0.0,
    weight_decay: float = 0,
    initial_accumulator_value: float = 0.0,
    eps: float = 1e-10,
):
    """Validate the hyper-parameters and set up Adagrad state.

    Every parameter receives a ``"sum"`` accumulator shaped like itself and
    filled with ``initial_accumulator_value``; the ``adagrad_update`` op is
    built last.

    Args:
        params: Parameters, or parameter-group dicts, to optimize.
        lr: Learning rate; must be non-negative.
        lr_decay: Learning-rate decay, stored in the options as given.
        weight_decay: Weight decay; must be non-negative.
        initial_accumulator_value: Initial accumulator fill; must be
            non-negative.
        eps: Epsilon; must be non-negative.
    """
    assert lr >= 0.0, f"Invalid learning rate: {lr}"
    assert weight_decay >= 0.0, f"Invalid weight_decay value: {weight_decay}"
    assert (
        initial_accumulator_value >= 0.0
    ), f"Invalid initial_accumulator_value value: {initial_accumulator_value}"
    assert eps >= 0.0, f"Invalid epsilon value: {eps}"

    options = {
        "lr": lr,
        "initial_accumulator_value": initial_accumulator_value,
        "lr_decay": lr_decay,
        "weight_decay": weight_decay,
        "eps": eps,
    }
    super().__init__(params, options)

    for group in self.param_groups:
        for param in group.parameters:
            assert param.is_leaf, "parameters must be leaf tensor"
            param_state = dict()
            self._state[param] = param_state
            param_state["sum"] = flow.zeros_like(param).fill_(
                initial_accumulator_value
            )

    op_builder = flow.stateful_op("adagrad_update")
    for input_name in ("model", "model_diff", "sum"):
        op_builder = op_builder.Input(input_name)
    self._op = op_builder.Build()
def step(self, closure: Callable = None):
    """Perform a single SGD optimization step.

    Parameters without a gradient are skipped. Groups whose ``momentum`` is
    ``0.0`` use the plain SGD kernel; other groups use the momentum kernel
    with a lazily created per-parameter ``momentum_buf``.

    Args:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.

    Returns:
        The closure's return value, or ``None`` if no closure was passed.
    """
    with flow.no_grad():
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            step_args = {"learning_rate": group["lr"], "l2": group["weight_decay"]}
            trainable = (p for p in group.parameters if p.grad is not None)
            for param in trainable:
                uses_momentum = not (group["momentum"] == 0.0)
                if uses_momentum:
                    if "momentum_buf" not in self._state[param]:
                        self._state[param]["momentum_buf"] = flow.zeros_like(param)
                    buf = self._state[param]["momentum_buf"]
                    flow._C.dispatch_momentum_update(
                        self._momentum_sgd,
                        (param, param.grad, buf),
                        beta=group["momentum"],
                        **step_args,
                    )
                else:
                    flow._C.dispatch_sgd_update(
                        self._sgd, (param, param.grad), **step_args
                    )

        self._state["step"] = self._state["step"] + 1
        return loss
def mask_finished_scores(score, flag):
    """Keep exactly one alive branch for every finished sequence.

    For rows marked finished, the first beam receives a score of zero and the
    remaining beams receive ``-inf``.

    Args:
        score: A real value array with shape
            [batch_size * beam_size, beam_size].
        flag: A bool array with shape [batch_size * beam_size, 1].

    Returns:
        A real value array with shape [batch_size * beam_size, beam_size].
    """
    num_beams = score.size(-1)
    zeros_u8 = flow.zeros_like(flag).to(dtype=flow.uint8)

    if num_beams <= 1:
        to_neg_inf = zeros_u8
        to_zero = flag.to(dtype=flow.uint8)
    else:
        tail = [1, num_beams - 1]
        # First column stays untouched; the rest follow the finished flag.
        to_neg_inf = flow.cat([zeros_u8, flag.repeat(tail)], dim=1)
        # Only the first column of a finished row is reset to zero.
        to_zero = flow.cat(
            (flag.to(dtype=flow.uint8), zeros_u8.repeat(tail)), dim=1
        )

    masked = flow.masked_fill(score, to_neg_inf == 1, -float("inf"))
    return flow.masked_fill(masked, to_zero == 1, 0)
def shift_tokens_right(input_ids: flow.Tensor, pad_token_id: int, decoder_start_token_id: int):
    """Shift input ids one token to the right.

    Column 0 of the result holds ``decoder_start_token_id``, and any ``-100``
    entry is replaced by ``pad_token_id``. The result is cast to
    ``flow.int32``.
    """
    shifted = flow.zeros_like(input_ids)
    shifted[:, 1:] = input_ids[:, :-1].clone()

    # The start token is written through a tensor rather than a plain scalar.
    start_token = flow.tensor(
        decoder_start_token_id,
        dtype=shifted.dtype,
        device=shifted.device,
    )
    shifted[:, 0] = start_token

    assert pad_token_id is not None, "self.model.pad_token_id has to be defined."

    # Replace possible -100 values in labels by `pad_token_id`; the fill is
    # done on a float copy and the result is cast back to int32.
    as_float = shifted.to(flow.float)
    ignore_mask = shifted.eq(-100).to(flow.int32)
    return as_float.masked_fill(ignore_mask, pad_token_id).to(flow.int32)
def loss_layer(self, feature_map, pred, label, bboxes, stride, prefix='loss_layer'):
    '''Compute the GIoU, confidence and class-probability losses for one scale.

    :param feature_map: [N, H, W, 3*(5+class_num)] raw network output
    :param pred: [N, H, W, 3, 4+1+class_num] decoded predictions
    :param label: [N, H, W, 3, 4+1+class_num] ground-truth tensor
    :param bboxes: [N, V, 4] ground-truth boxes
    :param stride: not read by the active code (only by commented-out variants)
    :param prefix: not read in this function
    :return: (giou_loss, conf_loss, prob_loss); each is summed over axes
        1-4 and then averaged over the batch
    '''
    # Split the channel axis into (anchor, 4 + 1 + class_num).
    feature_map = flow.reshape(
        feature_map,
        shape=(feature_map.shape[0], feature_map.shape[1],
               feature_map.shape[2], self.anchor_per_scale, -1))
    # shape: [N, H, W, 3, 1]
    raw_conf = flow.slice(feature_map,
                          begin=[None, None, None, None, 4],
                          size=[None, None, None, None, 1])
    # shape: [N, H, W, 3, class_num]
    raw_prob = flow.slice(
        feature_map,
        begin=[None, None, None, None, 5],
        size=[None, None, None, None, feature_map.shape[-1] - 5])
    # [N, H, W, 3, 4]
    pred_xywh = flow.slice(pred,
                           begin=[None, None, None, None, 0],
                           size=[None, None, None, None, 4])
    pred_conf = flow.slice(pred,
                           begin=[None, None, None, None, 4],
                           size=[None, None, None, None, 1])
    # Ground-truth box, objectness mask and class probabilities.
    label_xywh = flow.slice(label,
                            begin=[None, None, None, None, 0],
                            size=[None, None, None, None, 4])
    respond_bbox = flow.slice(label,
                              begin=[None, None, None, None, 4],
                              size=[None, None, None, None, 1])
    label_prob = flow.slice(
        label,
        begin=[None, None, None, None, 5],
        size=[None, None, None, None, label.shape[-1] - 5])
    # [N, H, W, 3, 1]
    giou = self.bbox_giou(pred_xywh, label_xywh)
    # Disabled variant: scale the box loss by relative box area.
    # label_w = flow.slice(label, begin=[None, None, None, None, 2], size=[None, None, None, None, 1])
    # label_h = flow.slice(label, begin=[None, None, None, None, 3], size=[None, None, None, None, 1])
    # bbox_loss_scale = 2.0 - 1.0 * label_w * label_h / ((stride * feature_map.shape[1]) ** 2)
    # [N, H, W, 3, 1]
    # giou_loss = respond_bbox * bbox_loss_scale * (1 - giou)
    # GIoU loss only counts where the label marks an object.
    giou_loss = respond_bbox * (1 - giou)
    # [N, 1, 1, 1, V, 4]
    bboxes_ = flow.expand_dims(bboxes, axis=1)
    bboxes_ = flow.expand_dims(bboxes_, axis=1)
    bboxes_ = flow.expand_dims(bboxes_, axis=1)
    # [N, H, W, 3, V]
    iou = self.bbox_iou(flow.expand_dims(pred_xywh, axis=-2), bboxes_)
    iou = flow.squeeze(iou, axis=[
        -1,
    ])
    # [N, H, W, 3, 1]
    max_iou = flow.math.reduce_max(iou, axis=-1, keepdims=True)
    # Equivalent of: respond_bgd = (1.0 - respond_bbox) * (max_iou < self.iou_loss_thresh)
    tmp = flow.math.less(
        max_iou,
        flow.constant_like(like=max_iou,
                           value=self.iou_loss_thresh,
                           dtype=flow.float32))
    # Background mask: non-object cells whose best IoU is below the threshold.
    respond_bgd = flow.where(
        tmp, 1.0 - respond_bbox,
        flow.zeros_like(respond_bbox, dtype=flow.float32))
    # [N, H, W, 3, 1]
    # Disabled variant: explicit focal-loss formulation.
    # ce = flow.nn.sigmoid_cross_entropy_with_logits(labels=respond_bbox, logits=raw_conf)
    # alpha_t = respond_bbox*self.focus_loss_alpha+(1.0-respond_bbox)*(1.0-self.focus_loss_alpha)
    # conf_loss = alpha_t*flow.math.pow(1.0-flow.math.exp(flow.math.negative(ce)), self.focus_loss_gamma)*ce
    # conf_loss = (respond_bbox+respond_bgd)*conf_loss
    conf_focal = self.focal(respond_bbox, pred_conf)
    # Confidence loss is taken on object cells and on background cells, both
    # against the objectness label, and weighted by the focal term.
    conf_loss = conf_focal * (
        respond_bbox * flow.nn.sigmoid_cross_entropy_with_logits(
            labels=respond_bbox, logits=raw_conf) +
        respond_bgd * flow.nn.sigmoid_cross_entropy_with_logits(
            labels=respond_bbox, logits=raw_conf))
    # [N, H, W, 3, 1]
    prob_loss = respond_bbox * flow.nn.sigmoid_cross_entropy_with_logits(
        labels=label_prob, logits=raw_prob)
    # Disabled variant: smooth-L1 box loss in place of GIoU.
    # label_w = flow.slice(label, begin=[None, None, None, None, 2], size=[None, None, None, None, 1])
    # label_h = flow.slice(label, begin=[None, None, None, None, 3], size=[None, None, None, None, 1])
    # bbox_loss_scale = 2.0 - 1.0 * label_w * label_h / ((stride * feature_map.shape[1]) * (stride * feature_map.shape[2]))
    # [N, H, W, 3, 1]
    # giou_loss = respond_bbox * bbox_loss_scale * flow.smooth_l1_loss(prediction=pred_xywh, label=label_xywh)
    # Sum each loss per sample, then average over the batch.
    giou_loss = flow.math.reduce_mean(
        flow.math.reduce_sum(giou_loss, axis=[1, 2, 3, 4]))
    conf_loss = flow.math.reduce_mean(
        flow.math.reduce_sum(conf_loss, axis=[1, 2, 3, 4]))
    prob_loss = flow.math.reduce_mean(
        flow.math.reduce_sum(prob_loss, axis=[1, 2, 3, 4]))
    return giou_loss, conf_loss, prob_loss
def inference(args):
    """Build a BERT pre-training model, load its weights and time one forward pass.

    The model is built from the sizes in ``args``, loaded either through
    ``load_params_from_lazy`` (when ``args.use_lazy_model`` is set) or through
    ``flow.load``, switched to eval mode, moved to ``args.device`` and wrapped
    in an ``nn.Graph``. A single random input of length ``args.seq_length`` is
    then run and the sentence-relationship scores are printed, together with
    the time taken by each phase.
    """
    tic = time.time()
    bert_module = BertForPreTraining(
        args.vocab_size,
        args.seq_length,
        args.hidden_size,
        args.num_hidden_layers,
        args.num_attention_heads,
        args.intermediate_size,
        nn.GELU(),
        args.hidden_dropout_prob,
        args.attention_probs_dropout_prob,
        args.max_position_embeddings,
        args.type_vocab_size,
        args.vocab_size,
    )
    toc = time.time()
    print("Initialize model using time: {:.3f}s".format(toc - tic))

    tic = time.time()
    if not args.use_lazy_model:
        bert_module.load_state_dict(flow.load(args.model_path))
    else:
        from utils.compare_lazy_outputs import load_params_from_lazy

        load_params_from_lazy(
            bert_module.state_dict(), args.model_path,
        )
    toc = time.time()
    print("Loading parameters using time: {:.3f}s".format(toc - tic))

    bert_module.eval()
    bert_module.to(args.device)

    class BertEvalGraph(nn.Graph):
        def __init__(self):
            super().__init__()
            self.bert = bert_module

        def build(self, input_ids, input_masks, segment_ids):
            input_ids = input_ids.to(device=args.device)
            input_masks = input_masks.to(device=args.device)
            segment_ids = segment_ids.to(device=args.device)

            with flow.no_grad():
                # 1. forward the next_sentence_prediction and masked_lm model
                outputs = self.bert(input_ids, input_masks, segment_ids)
                _, seq_relationship_scores = outputs

            return seq_relationship_scores

    bert_eval_graph = BertEvalGraph()

    tic = time.time()
    token_ids = [np.random.randint(0, 20, size=args.seq_length)]
    token_ids = flow.Tensor(
        token_ids, dtype=flow.int64, device=flow.device(args.device)
    )
    # Non-zero token ids are treated as attended positions.
    attention_mask = flow.cast(token_ids > 0, dtype=flow.int64)
    segment_info = flow.zeros_like(token_ids)
    prediction = bert_eval_graph(token_ids, attention_mask, segment_info)
    print(prediction.numpy())
    toc = time.time()
    print("Inference using time: {:.3f}".format(toc - tic))
def reduce_variance(
    input_tensor: remote_blob_util.BlobDef,
    axis: Optional[Union[int, Sequence[int]]] = None,
    keepdims: bool = False,
    name: Optional[str] = None,
) -> remote_blob_util.BlobDef:
    r"""Compute the variance of the input Blob along the given axis.

    The equation is:

    .. math::
        out=\frac{1}{n}*\sum_{i=1}^{n}(x_i-mean)^2

    It is evaluated as the mean of the squares minus the square of the mean.

    Args:
        input_tensor (remote_blob_util.BlobDef): A Blob
        axis (Optional[Union[int, Sequence[int]]], optional): The dimension
            along which the variance is computed. Defaults to None.
        keepdims (bool, optional): Whether to keep the reduced dimension in
            the output Blob. Defaults to False.
        name (Optional[str], optional): The name for the operation.
            Defaults to None.

    Returns:
        remote_blob_util.BlobDef: The variance of the input Blob on the
        specified axis. A zero Blob shaped like the input is returned when
        the checked axis list is empty.

    For example:

    .. code-block:: python

        import oneflow as flow
        import numpy as np
        import oneflow.typing as tp


        @flow.global_function()
        def reduce_variance_Job(x: tp.Numpy.Placeholder((3, 3))
        ) -> tp.Numpy:
            return flow.math.reduce_variance(x, axis=1, keepdims=True)


        x = np.array([[0, 5, 10], [5, 5, 5], [12, 3, 0]]).astype(np.float32)
        out = reduce_variance_Job(x)

        # output [[16.666668]
        #         [ 0.      ]
        #         [26.      ]]

    """
    name = _gen_unique_name_if_need(name, "ReduceVariance_")
    axis = _check_axis(axis, input_tensor.shape)

    nothing_to_reduce = isinstance(axis, list) and len(axis) == 0
    if nothing_to_reduce:
        return flow.zeros_like(
            input_tensor, dtype=input_tensor.dtype, name=name + "_zeros_like"
        )

    ops = flow.math
    minuend = ops.reduce_mean(
        ops.square(input_tensor, name + "_square_minuend"),
        axis,
        keepdims,
        name + "_reduce_mean_minuend",
    )
    subtrahend = ops.square(
        ops.reduce_mean(
            input_tensor, axis, keepdims, name + "_reduce_mean_subtrahend"
        ),
        name + "_square_subtrahend",
    )
    return ops.subtract(minuend, subtrahend, name + "_subtract")
def multi_head_attention_forward(
    query: Tensor,
    key: Tensor,
    value: Tensor,
    embed_dim_to_check: int,
    num_heads: int,
    in_proj_weight: Tensor,
    in_proj_bias: Optional[Tensor],
    bias_k: Optional[Tensor],
    bias_v: Optional[Tensor],
    add_zero_attn: bool,
    dropout_p: float,
    out_proj_weight: Tensor,
    out_proj_bias: Optional[Tensor],
    training: bool = True,
    key_padding_mask: Optional[Tensor] = None,
    need_weights: bool = True,
    attn_mask: Optional[Tensor] = None,
    use_separate_proj_weight: bool = False,
    q_proj_weight: Optional[Tensor] = None,
    k_proj_weight: Optional[Tensor] = None,
    v_proj_weight: Optional[Tensor] = None,
    static_k: Optional[Tensor] = None,
    static_v: Optional[Tensor] = None,
) -> Tuple[Tensor, Optional[Tensor]]:
    """Functional multi-head attention forward pass.

    Args:
        query: Unpacked as ``(tgt_len, bsz, embed_dim)``.
        key: Its first dimension is taken as ``src_len``.
        value: Must match ``key`` in shape (or in its first two dims when
            separate projection weights are used).
        embed_dim_to_check: Expected ``embed_dim`` of ``query``.
        num_heads: Number of heads; must divide ``embed_dim``.
        in_proj_weight, in_proj_bias: Packed input projection parameters.
        bias_k, bias_v: Optional extra key/value entries appended along
            dimension 0; both must be given or both omitted.
        add_zero_attn: Append one all-zero key/value entry per head.
        dropout_p: Attention dropout probability; forced to 0 when not
            ``training``.
        out_proj_weight, out_proj_bias: Output projection parameters.
        training: Whether dropout is active.
        key_padding_mask: Optional mask; must have shape ``(bsz, src_len)``
            after any padding added here.
        need_weights: Whether to return head-averaged attention weights.
        attn_mask: Optional non-floating-point mask, 2D
            ``(tgt_len, src_len)`` or 3D ``(bsz * num_heads, tgt_len, src_len)``.
        use_separate_proj_weight: Use ``q/k/v_proj_weight`` instead of the
            packed ``in_proj_weight``.
        q_proj_weight, k_proj_weight, v_proj_weight: Separate projection
            weights; required when ``use_separate_proj_weight`` is set.
        static_k, static_v: Optional precomputed key/value, expected with
            size ``bsz * num_heads`` in dim 0 and ``head_dim`` in dim 2.

    Returns:
        ``(attn_output, attn_weights)``: the output is reshaped to
        ``(tgt_len, bsz, embed_dim)`` before the output projection, and the
        weights are averaged over heads, or ``None`` when ``need_weights``
        is False.

    Raises:
        RuntimeError: If ``attn_mask`` has an unsupported rank or shape.
        AssertionError: If any of the shape/argument checks fails.
    """
    # set up shape vars
    tgt_len, bsz, embed_dim = query.shape
    src_len, _, _ = key.shape
    assert (
        embed_dim == embed_dim_to_check
    ), f"was expecting embedding dimension of {embed_dim_to_check}, but got {embed_dim}"
    if isinstance(embed_dim, Tensor):
        # embed_dim can be a tensor when JIT tracing
        head_dim = embed_dim.div(num_heads)
    else:
        head_dim = embed_dim // num_heads
    assert (head_dim * num_heads == embed_dim
            ), f"embed_dim {embed_dim} not divisible by num_heads {num_heads}"
    if use_separate_proj_weight:
        # allow MHA to have different embedding dimensions when separate projection weights are used
        assert (
            key.shape[:2] == value.shape[:2]
        ), f"key's sequence and batch dims {key.shape[:2]} do not match value's {value.shape[:2]}"
    else:
        assert (
            key.shape == value.shape
        ), f"key shape {key.shape} does not match value shape {value.shape}"

    #
    # compute in-projection
    #
    if not use_separate_proj_weight:
        q, k, v = _in_projection_packed(query, key, value, in_proj_weight,
                                        in_proj_bias)
    else:
        assert (q_proj_weight is not None
                ), "use_separate_proj_weight is True but q_proj_weight is None"
        assert (k_proj_weight is not None
                ), "use_separate_proj_weight is True but k_proj_weight is None"
        assert (v_proj_weight is not None
                ), "use_separate_proj_weight is True but v_proj_weight is None"
        if in_proj_bias is None:
            b_q = b_k = b_v = None
        else:
            # The packed bias is split into equal q / k / v parts.
            b_q, b_k, b_v = in_proj_bias.chunk(3, dim=0)
        q, k, v = _in_projection(
            query,
            key,
            value,
            q_proj_weight,
            k_proj_weight,
            v_proj_weight,
            b_q,
            b_k,
            b_v,
        )

    # prep attention mask
    if attn_mask is not None:
        assert (
            attn_mask.dtype.is_floating_point == False
        ), f"Only integer type are supported for attn_mask, not {attn_mask.dtype}"
        # ensure attn_mask's dim is 3
        if attn_mask.dim() == 2:
            correct_2d_size = (tgt_len, src_len)
            if attn_mask.shape != correct_2d_size:
                raise RuntimeError(
                    f"The shape of the 2D attn_mask is {attn_mask.shape}, but should be {correct_2d_size}."
                )
            attn_mask = attn_mask.unsqueeze(0)
        elif attn_mask.dim() == 3:
            correct_3d_size = (bsz * num_heads, tgt_len, src_len)
            if attn_mask.shape != correct_3d_size:
                raise RuntimeError(
                    f"The shape of the 3D attn_mask is {attn_mask.shape}, but should be {correct_3d_size}."
                )
        else:
            raise RuntimeError(
                f"attn_mask's dimension {attn_mask.dim()} is not supported")

    # add bias along batch dimension (currently second)
    if bias_k is not None and bias_v is not None:
        assert static_k is None, "bias cannot be added to static key."
        assert static_v is None, "bias cannot be added to static value."
        k = flow.cat([k, bias_k.repeat((1, bsz, 1))])
        v = flow.cat([v, bias_v.repeat((1, bsz, 1))])
        # The masks grow by one column to cover the appended key position.
        if attn_mask is not None:
            attn_mask = pad(attn_mask, (0, 1, 0, 0))
        if key_padding_mask is not None:
            key_padding_mask = pad(key_padding_mask, (0, 1, 0, 0))
    else:
        assert bias_k is None
        assert bias_v is None

    #
    # reshape q, k, v for multihead attention and make em batch first
    #
    # replace torch.contiguous with reshape
    q = q.reshape(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
    if static_k is None:
        k = k.reshape(-1, bsz * num_heads, head_dim).transpose(0, 1)
    else:
        assert (
            static_k.size(0) == bsz * num_heads
        ), f"expecting static_k.size(0) of {bsz * num_heads}, but got {static_k.size(0)}"
        assert (
            static_k.size(2) == head_dim
        ), f"expecting static_k.size(2) of {head_dim}, but got {static_k.size(2)}"
        k = static_k
    if static_v is None:
        v = v.reshape(-1, bsz * num_heads, head_dim).transpose(0, 1)
    else:
        assert (
            static_v.size(0) == bsz * num_heads
        ), f"expecting static_v.size(0) of {bsz * num_heads}, but got {static_v.size(0)}"
        assert (
            static_v.size(2) == head_dim
        ), f"expecting static_v.size(2) of {head_dim}, but got {static_v.size(2)}"
        v = static_v

    # add zero attention along batch dimension (now first)
    if add_zero_attn:
        zero_attn_shape = (bsz * num_heads, 1, head_dim)
        k = flow.cat(
            [k, flow.zeros(zero_attn_shape, dtype=k.dtype, device=k.device)],
            dim=1)
        v = flow.cat(
            [v, flow.zeros(zero_attn_shape, dtype=v.dtype, device=v.device)],
            dim=1)
        # Again widen the masks by one column for the appended position.
        if attn_mask is not None:
            attn_mask = pad(attn_mask, (0, 1, 0, 0))
        if key_padding_mask is not None:
            key_padding_mask = pad(key_padding_mask, (0, 1, 0, 0))

    # update source sequence length after adjustments
    src_len = k.size(1)

    # merge key padding and attention masks
    if key_padding_mask is not None:
        assert key_padding_mask.shape == (
            bsz,
            src_len,
        ), f"expecting key_padding_mask shape of {(bsz, src_len)}, but got {key_padding_mask.shape}"
        # Broadcast the per-batch padding mask to one mask per head and
        # target position.
        key_padding_mask = (key_padding_mask.reshape(
            bsz, 1, 1, src_len).expand(-1, num_heads, tgt_len,
                                       -1).reshape(bsz * num_heads, tgt_len,
                                                   src_len))
        if attn_mask is not None:
            attn_mask = attn_mask.expand(bsz * num_heads, -1, -1)
        if attn_mask is None:
            attn_mask = key_padding_mask
        else:
            attn_mask = flow.logical_or(attn_mask, key_padding_mask)

    # convert mask to float
    if attn_mask is not None and attn_mask.dtype.is_floating_point == False:
        # Positions selected by the mask become -inf; all others stay 0.
        new_attn_mask = flow.zeros_like(attn_mask).to(flow.float)
        new_attn_mask = new_attn_mask.masked_fill(attn_mask, float("-inf"))
        attn_mask = new_attn_mask

    # adjust dropout probability
    if not training:
        dropout_p = 0.0

    #
    # (deep breath) calculate attention and out projection
    #
    attn_output, attn_output_weights = _scaled_dot_product_attention(
        q, k, v, attn_mask, dropout_p)
    attn_output = attn_output.transpose(0, 1).reshape(tgt_len, bsz, embed_dim)
    attn_output = linear(attn_output, out_proj_weight, out_proj_bias)

    if need_weights:
        # average attention weights over heads
        attn_output_weights = attn_output_weights.reshape(
            bsz, num_heads, tgt_len, src_len)
        return attn_output, attn_output_weights.sum(dim=1) / num_heads
    else:
        return attn_output, None
def train(self):
    """Run the adversarial training loop for ``self.num_iters`` iterations.

    Each iteration:
      1. fetches a batch from ``self.data_loader`` (restarting the loader
         when it is exhausted) and draws target labels by permuting the
         batch;
      2. updates the classifier ``self.C`` on the real batch;
      3. updates the discriminator ``self.D`` on real and generated data;
      4. every ``self.n_critic`` iterations, updates the generator
         ``self.G`` with adversarial, cycle, classification and identity
         losses.

    Periodically (controlled by ``log_step``, ``sample_step``,
    ``model_save_step`` and ``lr_update_step``) it prints the current
    losses, writes converted sample audio to ``self.sample_dir``, saves
    the state dicts of G/D/C to ``self.model_save_dir`` and linearly
    decays the learning rates.
    """
    # Learning rate cache for decaying.
    g_lr = self.g_lr
    d_lr = self.d_lr
    c_lr = self.c_lr

    start_iters = 0
    if self.resume_iters:
        # NOTE(review): resuming from a checkpoint is not implemented here;
        # training always starts from iteration 0.
        pass

    norm = Normalizer()
    data_iter = iter(self.data_loader)

    # The criteria are constructed without arguments and are reused
    # across iterations, so build them once.
    ce_loss = nn.CrossEntropyLoss()
    bce_loss = nn.BCEWithLogitsLoss()
    l1_loss = nn.L1Loss()

    print("Start training......")
    start_time = datetime.now()

    for i in range(start_iters, self.num_iters):
        # ------------------------------------------------------------ #
        # Preprocess input data
        # ------------------------------------------------------------ #
        # Fetch real images and labels. Only an exhausted iterator should
        # trigger a restart; any other error (bad sample, Ctrl-C, ...)
        # must propagate instead of being silently retried.
        try:
            x_real, speaker_idx_org, label_org = next(data_iter)
        except StopIteration:
            data_iter = iter(self.data_loader)
            x_real, speaker_idx_org, label_org = next(data_iter)

        # Generate target domain labels randomly by shuffling the batch.
        rand_idx = flow.randperm(label_org.size(0))
        label_trg = label_org[rand_idx]
        speaker_idx_trg = speaker_idx_org[rand_idx]

        x_real = x_real.to(self.device)
        # Original domain one-hot labels.
        label_org = label_org.to(self.device)
        # Target domain one-hot labels.
        label_trg = label_trg.to(self.device)
        speaker_idx_org = speaker_idx_org.to(self.device)
        speaker_idx_trg = speaker_idx_trg.to(self.device)

        # ------------------------------------------------------------ #
        # Train the classifier
        # ------------------------------------------------------------ #
        # Compute loss with real audio frame.
        cls_real = self.C(x_real)
        cls_loss_real = ce_loss(input=cls_real, target=speaker_idx_org)

        self.reset_grad()
        cls_loss_real.backward()
        self.c_optimizer.step()

        # Logging.
        loss = {}
        loss["C/C_loss"] = cls_loss_real.item()

        # ------------------------------------------------------------ #
        # Train the discriminator
        # ------------------------------------------------------------ #
        out_r = self.D(x_real, label_org)
        # Compute loss with fake audio frame. The fake batch is detached
        # so the adversarial term does not back-propagate into G.
        x_fake = self.G(x_real, label_trg)
        out_f = self.D(x_fake.detach(), label_trg)
        d_loss_t = bce_loss(
            input=out_f, target=flow.zeros_like(out_f).float()
        ) + bce_loss(input=out_r, target=flow.ones_like(out_r).float())

        out_cls = self.C(x_fake)
        d_loss_cls = ce_loss(input=out_cls, target=speaker_idx_trg)

        # Compute loss for gradient penalty.
        # NOTE: x_hat / out_src are only consumed by the gradient-penalty
        # branch below; they are still computed unconditionally so the
        # random-number stream matches the previous behaviour.
        alpha = flow.rand(x_real.size(0), 1, 1, 1).to(self.device)
        x_hat = (
            (alpha * x_real + (1 - alpha) * x_fake)
            .detach()
            .requires_grad_(True)
        )
        out_src = self.D(x_hat, label_trg)

        # TODO: Second-order derivation is not currently supported in
        # oneflow, so gradient penalty cannot be used temporarily.
        if self.use_gradient_penalty:
            d_loss_gp = self.gradient_penalty(out_src, x_hat)
            d_loss = d_loss_t + self.lambda_cls * d_loss_cls + 5 * d_loss_gp
        else:
            d_loss = d_loss_t + self.lambda_cls * d_loss_cls

        self.reset_grad()
        d_loss.backward()
        self.d_optimizer.step()

        loss["D/D_loss"] = d_loss.item()

        # ------------------------------------------------------------ #
        # Train the generator
        # ------------------------------------------------------------ #
        if (i + 1) % self.n_critic == 0:
            # Original-to-target domain.
            x_fake = self.G(x_real, label_trg)
            g_out_src = self.D(x_fake, label_trg)
            g_loss_fake = bce_loss(
                input=g_out_src, target=flow.ones_like(g_out_src).float()
            )

            out_cls = self.C(x_real)
            g_loss_cls = ce_loss(input=out_cls, target=speaker_idx_org)

            # Target-to-original domain.
            x_reconst = self.G(x_fake, label_org)
            g_loss_rec = l1_loss(x_reconst, x_real)

            # Original-to-Original domain(identity).
            x_fake_iden = self.G(x_real, label_org)
            id_loss = l1_loss(x_fake_iden, x_real)

            # Backward and optimize.
            g_loss = (
                g_loss_fake
                + self.lambda_cycle * g_loss_rec
                + self.lambda_cls * g_loss_cls
                + self.lambda_identity * id_loss
            )

            self.reset_grad()
            g_loss.backward()
            self.g_optimizer.step()

            # Logging.
            loss["G/loss_fake"] = g_loss_fake.item()
            loss["G/loss_rec"] = g_loss_rec.item()
            loss["G/loss_cls"] = g_loss_cls.item()
            loss["G/loss_id"] = id_loss.item()
            loss["G/g_loss"] = g_loss.item()

        # ------------------------------------------------------------ #
        # Miscellaneous
        # ------------------------------------------------------------ #
        # Print out training information.
        if (i + 1) % self.log_step == 0:
            et = datetime.now() - start_time
            # Drop the trailing ".ffffff" microseconds from the timedelta.
            et = str(et)[:-7]
            log = "Elapsed [{}], Iteration [{}/{}]".format(
                et, i + 1, self.num_iters
            )
            for tag, value in loss.items():
                log += ", {}: {:.4f}".format(tag, value)
            print(log)

        # Convert the test utterances to a random other speaker and write
        # the results to disk for debugging.
        if (i + 1) % self.sample_step == 0:
            with flow.no_grad():
                d, speaker = TestSet(self.test_dir).test_data()
                target = random.choice([x for x in speakers if x != speaker])
                label_t = self.spk_enc.transform([target])[0]
                label_t = np.asarray([label_t])
                # The target label is the same for every segment, so build
                # the tensor once per sampling pass.
                label_tensor = flow.Tensor(label_t).to(self.device)

                for filename, content in d.items():
                    f0 = content["f0"]
                    ap = content["ap"]
                    sp_norm_pad = self.pad_coded_sp(content["coded_sp_norm"])

                    # Convert the padded features in non-overlapping
                    # windows of FRAMES columns.
                    convert_result = []
                    for start_idx in range(
                        0, sp_norm_pad.shape[1] - FRAMES + 1, FRAMES
                    ):
                        one_seg = sp_norm_pad[:, start_idx:start_idx + FRAMES]
                        one_seg = flow.Tensor(one_seg).to(self.device)
                        one_seg = one_seg.view(
                            1, 1, one_seg.size(0), one_seg.size(1)
                        )
                        one_set_return = (
                            self.G(one_seg, label_tensor)
                            .detach()
                            .cpu()
                            .numpy()
                        )
                        one_set_return = np.squeeze(one_set_return)
                        one_set_return = norm.backward_process(
                            one_set_return, target
                        )
                        convert_result.append(one_set_return)

                    convert_con = np.concatenate(convert_result, axis=1)
                    # Trim the padding back to the original frame count.
                    convert_con = convert_con[
                        :, 0:content["coded_sp_norm"].shape[1]
                    ]
                    contigu = np.ascontiguousarray(
                        convert_con.T, dtype=np.float64
                    )
                    decoded_sp = decode_spectral_envelope(
                        contigu, SAMPLE_RATE, fft_size=FFTSIZE
                    )
                    f0_converted = norm.pitch_conversion(f0, speaker, target)
                    wav = synthesize(f0_converted, decoded_sp, ap, SAMPLE_RATE)

                    name = f"{speaker}-{target}_iter{i+1}_{filename}"
                    path = os.path.join(self.sample_dir, name)
                    print(f"[save]:{path}")
                    sf.write(path, wav, SAMPLE_RATE)

        # Save model checkpoints.
        if (i + 1) % self.model_save_step == 0:
            G_path = os.path.join(self.model_save_dir, "{}-G".format(i + 1))
            D_path = os.path.join(self.model_save_dir, "{}-D".format(i + 1))
            C_path = os.path.join(self.model_save_dir, "{}-C".format(i + 1))
            flow.save(self.G.state_dict(), G_path)
            flow.save(self.D.state_dict(), D_path)
            flow.save(self.C.state_dict(), C_path)
            print(
                "Saved model checkpoints into {}...".format(
                    self.model_save_dir
                )
            )

        # Decay learning rates linearly over the last num_iters_decay
        # iterations.
        if (i + 1) % self.lr_update_step == 0 and (i + 1) > (
            self.num_iters - self.num_iters_decay
        ):
            g_lr -= self.g_lr / float(self.num_iters_decay)
            d_lr -= self.d_lr / float(self.num_iters_decay)
            c_lr -= self.c_lr / float(self.num_iters_decay)
            self.update_lr(g_lr, d_lr, c_lr)
            print(
                "Decayed learning rates, g_lr: {}, d_lr: {}.".format(
                    g_lr, d_lr
                )
            )
def recognize(self, inputs, inputs_mask):
    """Beam-search decode ``inputs`` and return the n-best hypotheses.

    The encoder output is tiled ``self.beam_width`` times per utterance,
    ``self.decode_step`` is called up to ``self.max_len`` times (stopping
    early once every beam is flagged as ended), and the beams of each
    utterance are then sorted by (optionally length-normalised) score.

    Args:
        inputs: Encoder input, forwarded unchanged to ``self.encode``.
        inputs_mask: Mask for ``inputs``, forwarded to ``self.encode``.

    Returns:
        A tuple ``(translations, nbest_scores)`` where ``translations`` is
        whatever ``self.nbest_translate`` returns for the top
        ``min(beam_width, nbest)`` token sequences (BOS stripped) and
        ``nbest_scores`` holds the matching scores.

    Side effects:
        Resets ``self.attn_weights`` and stores the encoder attention
        weights under ``"encoder"`` and an empty list under ``"decoder"``.
    """
    cache = {"fronend": None, "encoder": None, "decoder": None, "lm": None}

    self.attn_weights = {}
    memory, memory_mask, _, enc_attn_weights = self.encode(
        inputs, inputs_mask)
    self.attn_weights["encoder"] = enc_attn_weights
    self.attn_weights["decoder"] = []

    batch, frames, dim = memory.size()
    beam = self.beam_width
    flat = batch * beam

    # Tile encoder memory and its mask so each beam has its own copy.
    tiled_memory = memory.unsqueeze(1).repeat([1, beam, 1, 1])
    beam_memory = tiled_memory.view(flat, frames, dim)
    tiled_mask = memory_mask.unsqueeze(1).repeat([1, beam, 1])
    beam_memory_mask = tiled_mask.view(flat, frames)

    # Every hypothesis starts with a single BOS token.
    preds = flow.ones(
        [flat, 1], dtype=flow.int64, device=memory.device) * BOS

    # Only the first beam of each utterance starts with a finite score.
    initial_scores = [0.0] + [-float("inf")] * (beam - 1)
    scores = flow.tensor(initial_scores, dtype=flow.float32)
    scores = scores.to(memory.device).repeat([batch]).unsqueeze(1)

    ending_flag = flow.zeros_like(scores).to(dtype=flow.uint8)

    with flow.no_grad():
        for _ in range(self.max_len):
            preds, cache, scores, ending_flag = self.decode_step(
                preds, beam_memory, beam_memory_mask, cache, scores,
                ending_flag)
            # Stop as soon as every beam has been flagged as ended.
            if ending_flag.sum() == flat:
                break

    scores = scores.view(batch, beam)
    preds = preds.view(batch, beam, -1)

    # Count, per beam, the tokens that are not EOS.
    lengths = flow.sum(flow.ne(preds, EOS).float(), dim=-1)

    # Length penalty.
    if self.penalty:
        lp = flow.pow(
            (self.lamda + lengths) / (self.lamda + 1), self.penalty)
        scores /= lp

    sorted_scores, offset_indices = flow.sort(
        scores, dim=-1, descending=True)

    # Turn per-utterance beam ranks into indices into the flattened preds.
    base_indices = flow.arange(
        batch, dtype=flow.int64, device=offset_indices.device) * beam
    base_indices = base_indices.unsqueeze(1).repeat([1, beam]).view(-1)
    indices = offset_indices.view(-1) + base_indices

    preds = preds.view(flat, -1)
    sorted_preds = preds[indices].view(batch, beam, -1)

    keep = min(beam, self.nbest)
    # Drop column 0 to remove BOS.
    nbest_preds = sorted_preds[:, :keep, 1:]
    nbest_scores = sorted_scores[:, :keep]

    return self.nbest_translate(nbest_preds), nbest_scores
def reset(self):
    """Replace ``val``, ``sum`` and ``count`` with zero-filled tensors.

    Each attribute is rebound to ``flow.zeros_like`` of its current value.
    """
    for attr in ("val", "sum", "count"):
        setattr(self, attr, flow.zeros_like(getattr(self, attr)))