def __init__(
    self,
    layer_idx,
    normalized_shape,
    eps=1e-5,
):
    super().__init__()
    self.normalized_shape = normalized_shape
    self.epsilon = eps

    self.beta = flow.nn.Parameter(
        flow.empty(
            normalized_shape,
            dtype=flow.float32,
            placement=dist.get_layer_placement(layer_idx),
            sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
        )
    )
    flow.nn.init.zeros_(self.beta)

    self.gamma = flow.nn.Parameter(
        flow.empty(
            normalized_shape,
            dtype=flow.float32,
            placement=dist.get_layer_placement(layer_idx),
            sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
        )
    )
    flow.nn.init.ones_(self.gamma)
def __init__(self, layer_idx, input_size, output_size, init_method, need_gelu=False):
    super().__init__()
    self.need_gelu = need_gelu

    args = get_args()
    self.bias_gelu_fusion = args.bias_gelu_fusion

    # col parallel linear weight sbp: [B, S(1)]
    self.weight = flow.nn.Parameter(
        flow.empty(
            (input_size, output_size),
            dtype=flow.float32,
            placement=dist.get_layer_placement(layer_idx),
            sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(1)]),
        )
    )
    init_method(self.weight)

    # col parallel linear bias sbp: [B, S(0)]
    self.bias = flow.nn.Parameter(
        flow.empty(
            (output_size,),
            dtype=flow.float32,
            placement=dist.get_layer_placement(layer_idx),
            sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)]),
        )
    )
    flow.nn.init.zeros_(self.bias)
def __init__(self, seq_length, hidden_size, vocab_size):
    super().__init__()
    self.seq_length = seq_length
    self.hidden_size = hidden_size
    self.vocab_size = vocab_size

    args = get_args()
    self.dropout = flow.nn.Dropout(p=args.hidden_dropout)
    self.enable_amp = args.fp16

    # word token embedding shape (vocab_size, hidden_size)
    # sbp: [B, S(0)]
    self.wte = flow.nn.Parameter(
        flow.empty(
            (self.vocab_size, self.hidden_size),
            dtype=flow.float32,
            placement=dist.get_layer_placement(0),
            sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)]),
        )
    )
    # word position embedding shape (seq_len, hidden_size)
    # sbp: [B, B]
    self.wpe = flow.nn.Parameter(
        flow.empty(
            (self.seq_length, self.hidden_size),
            dtype=flow.float32,
            placement=dist.get_layer_placement(0),
            sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
        )
    )

    flow.nn.init.normal_(self.wte, std=args.init_method_std)
    flow.nn.init.normal_(self.wpe, std=args.init_method_std)
def test_consistent_tensor_2d_sbp_init(test_case):
    V = 10
    H = 4
    S = 6
    P = flow.placement("cuda", {0: [0, 1, 2, 3]}, (2, 2))

    wte = flow.nn.Parameter(
        flow.empty(
            (V, H),
            dtype=flow.float32,
            placement=P,
            sbp=[flow.sbp.broadcast, flow.sbp.split(0)],
        )
    )
    wpe = flow.nn.Parameter(
        flow.empty(
            (S, H),
            dtype=flow.float32,
            placement=P,
            sbp=[flow.sbp.broadcast, flow.sbp.broadcast],
        )
    )

    flow.nn.init.normal_(wte, std=0.02)
    flow.nn.init.normal_(wpe, std=0.02)
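# Hedged note (not part of the snippet above): {0: [0, 1, 2, 3]} with the
# hierarchy (2, 2) arranges four GPUs as a 2 x 2 device mesh, which is why a
# two-entry sbp list is given, one signature per mesh axis. In the newer
# placement API the same mesh would presumably be written as:
import oneflow as flow

P2 = flow.placement("cuda", ranks=[[0, 1], [2, 3]])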
def func(t):
    shape = t.shape
    dtype = t.dtype
    with oneflow._oneflow_internal.lazy_mode.guard(False):
        if t.is_consistent:
            eager_out = oneflow.empty(
                shape,
                dtype=dtype,
                placement=t.placement,
                sbp=t.sbp,
            )
        else:
            eager_out = oneflow.empty(shape, dtype=dtype, device=t.device)
    return eager_out
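# Hedged usage sketch (input values are assumed): func builds an eager "empty"
# placeholder that mirrors the shape, dtype and device (or placement/sbp) of
# whatever tensor it is handed.
import oneflow

t = oneflow.ones(2, 4)
placeholder = func(t)
assert placeholder.shape == t.shape and placeholder.dtype == t.dtype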
def drop_connect(x, drop_ratio):
    keep_ratio = 1.0 - drop_ratio
    mask = oneflow.empty([x.shape[0], 1, 1, 1], dtype=x.dtype, device=x.device)
    mask.bernoulli_(keep_ratio)
    x.div_(keep_ratio)
    x.mul_(mask)
    return x
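# Hedged usage sketch (values are made up): drop_connect zeroes whole samples
# with probability drop_ratio and rescales the survivors by 1 / keep_ratio, so
# the expected activation per sample is unchanged. Note it modifies x in place.
import oneflow

x = oneflow.randn(8, 16, 4, 4)       # hypothetical activation batch
y = drop_connect(x, drop_ratio=0.2)  # roughly 20% of the 8 samples are zeroed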
def test_to_placement(test_case):
    rank = flow.env.get_rank()
    # pid = os.getpid()
    # print(f"[{pid}][{rank}] ToConsistentGraphTestCase.test_to_placement")

    if rank == 0:
        x = flow.ones((2, 3), dtype=flow.float32)
    elif rank == 1:
        x = flow.empty(tuple())
    else:
        raise ValueError

    c_x = x.to_consistent(
        placement=flow.placement("cpu", {0: [0]}), sbp=flow.sbp.broadcast
    )
    # print(f"c_x shape: {c_x.shape}, placement: {c_x.placement}, sbp: {c_x.sbp}")

    p1 = flow.placement("cpu", {0: [0, 1]})
    m1 = ToPlacementModule(p1)
    g1 = MyGraph(m1)
    y1 = g1(c_x)
    # print(f"y1 shape: {y1.shape}, placement: {y1.placement}, sbp: {y1.sbp}")
    test_case.assertTrue(y1.placement == p1)
    test_case.assertTrue(y1.sbp[0] == flow.sbp.broadcast)
    test_case.assertTrue(y1.to_local().numpy().mean() == 1.0)

    p2 = flow.placement("cuda", {0: [0, 1]})
    m2 = ToPlacementModule(p2)
    g2 = MyGraph(m2)
    y2 = g2(y1)
    # print(f"y2 shape: {y2.shape}, placement: {y2.placement}, sbp: {y2.sbp}")
    test_case.assertTrue(y2.placement == p2)
    test_case.assertTrue(y2.sbp[0] == flow.sbp.broadcast)
    test_case.assertTrue(y2.to_local().numpy().mean() == 1.0)
def gather(tensor, gather_list=None, dst=0):
    """
    Gathers a list of tensors in a single process.

    Args:
        tensor (Tensor): Input tensor.
        gather_list (list[Tensor], optional): List of appropriately-sized
            tensors to use for gathered data (default is None, must be
            specified on the destination rank)
        dst (int, optional): Destination rank (default is 0)
    """
    assert isinstance(tensor, flow._oneflow_internal.Tensor)
    assert tensor.is_local
    shape = tensor.shape
    dtype = tensor.dtype

    tensor = tensor.expand(*([1] + list(shape)))
    device_type = tensor.device.type
    placement = flow.env.all_device_placement(device_type)
    tensor = tensor.to_consistent(
        placement=placement, sbp=flow.sbp.split(0)
    ).to_consistent(placement=placement, sbp=flow.sbp.broadcast)

    if gather_list is None:
        gather_list = [
            flow.empty(shape, dtype=dtype) for _ in range(flow.env.get_world_size())
        ]

    assert gather_list is not None
    assert isinstance(gather_list, list)
    assert len(gather_list) == flow.env.get_world_size()

    for i in range(tensor.shape[0]):
        gather_list[i] = tensor[i].to_local()
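# Hedged usage sketch: run with two ranks, e.g. via
#   python3 -m oneflow.distributed.launch --nproc_per_node 2 this_script.py
# gather_list is assumed to be pre-sized on every rank here for simplicity.
import oneflow as flow

local = flow.ones(2, 2) * flow.env.get_rank()
outputs = [flow.empty(2, 2) for _ in range(flow.env.get_world_size())]
gather(local, gather_list=outputs, dst=0)
# after the call, outputs[i] holds rank i's tensor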
def __init__(
    self, num_parameters: int = 1, init: float = 0.25, device=None, dtype=None
) -> None:
    super().__init__()
    self.num_parameters = num_parameters
    self.weight = flow.nn.Parameter(
        flow.empty(num_parameters, dtype=dtype, device=device).fill_(init)
    )
def __init__(self, embedding_size, num_classes, cfg, partial_fc=False, bias=False):
    super(FC7, self).__init__()
    self.weight = flow.nn.Parameter(flow.empty(num_classes, embedding_size))
    flow.nn.init.normal_(self.weight, mean=0, std=0.01)

    self.partial_fc = partial_fc
    size = flow.env.get_world_size()
    num_local = (cfg.num_classes + size - 1) // size
    self.num_sample = int(num_local * cfg.sample_rate)
    self.total_num_sample = self.num_sample * size
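# Hedged arithmetic sketch for the partial-fc sampling sizes computed above,
# using assumed values: 1000 classes, world size 4, sample_rate 0.1.
num_classes, world_size, sample_rate = 1000, 4, 0.1
num_local = (num_classes + world_size - 1) // world_size  # 250 class centers per rank
num_sample = int(num_local * sample_rate)                 # 25 sampled centers per rank
total_num_sample = num_sample * world_size                # 100 sampled centers in total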
def _test_local_empty(test_case, shape, dtype, device, requires_grad):
    x = flow.empty(
        shape,
        dtype=dtype,
        device=flow.device(device),
        requires_grad=requires_grad if dtype == flow.float32 else False,
    )
    test_case.assertFalse(x.is_global)
    test_case.assertEqual(x.shape, flow.Size(shape))
    test_case.assertEqual(x.dtype, dtype)
    test_case.assertEqual(x.device, flow.device(device))
    if dtype == flow.float32:
        test_case.assertEqual(x.requires_grad, requires_grad)
def build_real_output(fake_eager_out):
    lbn = out2name[fake_eager_out] + "/out"
    assert lbn in self._full_job_proto.helper.lbn2logical_blob_desc
    blob_conf = self._full_job_proto.helper.lbn2logical_blob_desc[lbn]

    shape = tuple(blob_conf.shape.dim)
    dtype = fake_eager_out.dtype

    with oneflow._oneflow_internal.lazy_mode.guard(False):
        if fake_eager_out.is_global:
            eager_out = oneflow.empty(
                shape,
                dtype=dtype,
                placement=fake_eager_out.placement,
                sbp=fake_eager_out.sbp,
            )
        else:
            eager_out = oneflow.empty(shape, dtype=dtype, device=fake_eager_out.device)

    return eager_out
def __init__(
    self,
    layer_idx,
    input_size,
    output_size,
    init_method,
    dropout_rate,
):
    super().__init__()
    self.dropout_rate = dropout_rate

    args = get_args()
    self.bias_dropout_fusion = args.bias_dropout_fusion
    if not self.bias_dropout_fusion:
        self.dropout = flow.nn.Dropout(p=dropout_rate)

    # row parallel linear weight sbp: [B, S(0)]
    self.weight = flow.nn.Parameter(
        flow.empty(
            (input_size, output_size),
            dtype=flow.float32,
            placement=dist.get_layer_placement(layer_idx),
            sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)]),
        )
    )
    init_method(self.weight)

    # row parallel linear bias sbp: [B, B]
    self.bias = flow.nn.Parameter(
        flow.empty(
            (output_size,),
            dtype=flow.float32,
            placement=dist.get_layer_placement(layer_idx),
            sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
        )
    )
    flow.nn.init.zeros_(self.bias)
def _test_consistent_empty(test_case, shape, dtype, placement, sbp, requires_grad):
    placement = flow.placement(placement, {0: [0]})
    x = flow.empty(
        shape,
        dtype=dtype,
        placement=placement,
        sbp=sbp,
        requires_grad=requires_grad if dtype == flow.float32 else False,
    )
    test_case.assertTrue(x.is_consistent)
    test_case.assertEqual(x.shape, flow.Size(shape))
    test_case.assertEqual(x.dtype, dtype)
    test_case.assertEqual(x.placement, placement)
    test_case.assertEqual(x.sbp[0], sbp)
    if dtype == flow.float32:
        test_case.assertEqual(x.requires_grad, requires_grad)
def test_save_and_load(self):
    placement_arg = {
        "placement": flow.placement("cuda", ranks=[0]),
        "sbp": flow.sbp.broadcast,
    }
    graph = InferGraph(placement_arg)
    image_placeholder = flow.empty(
        (1, 3, 224, 224),
        dtype=flow.float32,
        placement=flow.placement("cpu", ranks=[0]),
        sbp=flow.sbp.broadcast,
    )
    graph._compile(image_placeholder)

    saved_path = os.path.join("saved_model", graph.name)
    if not os.path.exists(saved_path):
        os.makedirs(saved_path)
    flow.save(graph, saved_path)

    saved_ir_path = os.path.join(saved_path, "model.mlir")
    serialized_job = oneflow._oneflow_internal.nn.graph.LoadSerializedJobFromIR(
        saved_ir_path
    )
    job = job_pb.Job()
    job.ParseFromString(serialized_job)

    op_list = []
    op_list_ = []
    for op in job.net.op:
        op_list.append(op)
    for op in graph._forward_job_proto.net.op:
        op_list_.append(op)

    def sort_by_op_name(op):
        return op.name

    op_list.sort(key=sort_by_op_name)
    op_list_.sort(key=sort_by_op_name)

    for (op, op_) in zip(op_list, op_list_):
        # TODO: convert loc in MLIR
        op_.ClearField("loc")
        self.assertTrue(op == op_, {"op": op, "op_": op_})
def _test_reshape_like_impl(test_case, pair, placement, in_sbp, like_sbp):
    shape, to_shape = pair
    nd_arr = np.random.rand(*shape)
    np_out = nd_arr.reshape(to_shape)

    x = flow.tensor(nd_arr)
    like = flow.empty(to_shape)
    y = x.to_global(flow.env.all_device_placement("cpu"), flow.sbp.broadcast).to_global(
        placement=placement, sbp=in_sbp
    )
    like = like.to_global(
        flow.env.all_device_placement("cpu"), flow.sbp.broadcast
    ).to_global(placement=placement, sbp=like_sbp)
    z = flow._C.reshape_like(y, like)

    local_z = z.to_global(
        placement,
        sbp=[flow.sbp.broadcast for _ in range(len(placement.ranks.shape))],
    ).to_local()
    if flow.env.get_rank() == 0:
        test_case.assertTrue(np.array_equal(np_out, local_z.numpy()))
def load_bin_cv(path, image_size):
    bins, issame_list = pickle.load(open(path, "rb"), encoding="bytes")
    data_list = []
    for flip in [0, 1]:
        data = flow.empty(len(issame_list) * 2, 3, image_size[0], image_size[1])
        data_list.append(data)

    for i in range(len(issame_list) * 2):
        _bin = bins[i]
        img_ori = cv.imdecode(_bin, cv.IMREAD_COLOR)[:, :, ::-1]
        for flip in [0, 1]:
            img = img_ori.copy()
            if flip == 1:
                img = cv.flip(img, 1)
            img = np.array(img).transpose((2, 0, 1))
            img = (img - 127.5) * 0.00784313725
            data_list[flip][i] = flow.tensor(img, dtype=flow.float)
        if i % 1000 == 0:
            logging.info("loading bin:%d", i)

    logging.info(data_list[0].shape)
    return data_list, issame_list
def build(self):
    x = flow.empty(*shape, placement=placement, sbp=sbp)
    return x
def pad_packed_sequence(
    sequence: PackedSequence,
    batch_first: bool = False,
    padding_value: float = 0.0,
    total_length: Optional[int] = None,
) -> Tuple[Tensor, Tensor]:
    """The interface is consistent with PyTorch.
    The documentation is referenced from:
    https://pytorch.org/docs/stable/generated/torch.nn.utils.rnn.pad_packed_sequence.html.

    Pads a packed batch of variable length sequences.

    It is an inverse operation to :func:`pack_padded_sequence`.

    The returned Tensor's data will be of size ``T x B x *``, where `T` is the length
    of the longest sequence and `B` is the batch size. If ``batch_first`` is True,
    the data will be transposed into ``B x T x *`` format.

    .. note::
        :attr:`total_length` is useful to implement the
        ``pack sequence -> recurrent network -> unpack sequence`` pattern in a
        :class:`~oneflow.nn.Module` wrapped in :class:`~oneflow.nn.DataParallel`.
        See :ref:`this FAQ section <pack-rnn-unpack-with-data-parallelism>` for details.

    Args:
        sequence (PackedSequence): batch to pad
        batch_first (bool, optional): if ``True``, the output will be in ``B x T x *``
            format.
        padding_value (float, optional): values for padded elements.
        total_length (int, optional): if not ``None``, the output will be padded to
            have length :attr:`total_length`. This method will throw :class:`ValueError`
            if :attr:`total_length` is less than the max sequence length in
            :attr:`sequence`.

    Returns:
        Tuple of Tensor containing the padded sequence, and a Tensor
        containing the list of lengths of each sequence in the batch.
        Batch elements will be re-ordered as they were ordered originally when
        the batch was passed to ``pack_padded_sequence`` or ``pack_sequence``.

    For example:

    .. code-block:: python

        >>> from oneflow.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
        >>> import oneflow as flow

        >>> seq = flow.tensor([[4,5,6], [1,2,0], [3,0,0]])
        >>> lens = [3, 2, 1]
        >>> packed = pack_padded_sequence(seq, lens, batch_first=True, enforce_sorted=True)
        >>> packed.data
        tensor([4, 1, 3, 5, 2, 6], dtype=oneflow.int64)
        >>> packed.batch_sizes
        tensor([3, 2, 1], dtype=oneflow.int64)
        >>> seq_unpacked, lens_unpacked = pad_packed_sequence(packed, batch_first=True)
        >>> seq_unpacked
        tensor([[4, 5, 6],
                [1, 2, 0],
                [3, 0, 0]], dtype=oneflow.int64)
        >>> lens_unpacked
        tensor([3., 2., 1.], dtype=oneflow.float32)

    """
    max_seq_length = sequence.batch_sizes.shape[0]
    if total_length is not None:
        if total_length < max_seq_length:
            raise ValueError(
                "Expected total_length to be at least the length "
                "of the longest sequence in input, but got "
                "total_length={} and max sequence length being {}".format(
                    total_length, max_seq_length
                )
            )
    else:
        total_length = max_seq_length

    batch_sizes_t = sequence.batch_sizes.contiguous()
    assert (
        len(batch_sizes_t.shape) == 1
        and batch_sizes_t.device.type == "cpu"
        and batch_sizes_t.dtype == flow.int64
    ), f"'sequence.batch_sizes' should be a 1D CPU int64 tensor, but got {len(batch_sizes_t.shape)} D {batch_sizes_t.device.type} {batch_sizes_t.dtype} tensor"

    batch_sizes = batch_sizes_t.numpy()
    max_batch_size = int(batch_sizes[0])
    max_real_seq_length = batch_sizes_t.shape[0]
    max_seq_length = max_real_seq_length
    if total_length > 0:
        assert (
            total_length >= max_seq_length
        ), f"Expected total_length to be at least the length of the longest sequence in input, but got total_length={total_length} and max sequence length being {max_seq_length}"
        max_seq_length = total_length

    # == [max_seq_length, max_batch_size, *sequence.data.size()[1:]]
    output_size = []
    output_size.append(max_seq_length)
    output_size.append(max_batch_size)
    output_size = output_size + list(sequence.data.shape[1:])

    padded_output = flow.full(
        output_size,
        padding_value,
        dtype=sequence.data.dtype,
        device=sequence.data.device,
        requires_grad=sequence.data.requires_grad,
    )
    # This will be modified at every iteration, but we reserve memory for it now.
    tmp_view_size = output_size  # == [-1, -1, *sequence.data.size()[1:]]

    lengths = flow.empty(max_batch_size)
    data_offset = 0
    prev_batch_size = max_batch_size
    prev_i = 0
    lengths_idx = max_batch_size - 1
    for i in range(max_real_seq_length + 1):
        batch_size = batch_sizes[i] if i != max_real_seq_length else 0
        if batch_size != prev_batch_size:
            l = prev_batch_size * (i - prev_i)
            tmp_view_size[0] = i - prev_i
            tmp_view_size[1] = prev_batch_size
            padded_output[prev_i:i, 0:prev_batch_size] = sequence.data[
                data_offset : data_offset + l
            ].view(tmp_view_size)
            data_offset += l
            prev_i = i
        dec = prev_batch_size - batch_size
        if dec > 0:
            for j in range(dec):
                lengths[lengths_idx] = i
                lengths_idx = lengths_idx - 1
        prev_batch_size = batch_size

    if batch_first:
        # build the permutation [1, 0, 2, ...] that swaps the time and batch dims
        permute_dims = [1, 0]
        for i in range(2, padded_output.ndim):
            permute_dims.append(i)
        padded_output = padded_output.permute(permute_dims)

    unsorted_indices = sequence.unsorted_indices
    if unsorted_indices is not None:
        batch_dim = 0 if batch_first else 1
        return (
            padded_output.index_select(batch_dim, unsorted_indices),
            lengths[unsorted_indices],
        )
    return padded_output, lengths
def _test_consistent_empty(test_case, shape, placement, sbp):
    x = flow.empty(*shape, placement=placement, sbp=sbp)

    test_case.assertEqual(x.shape, flow.Size(shape))
    test_case.assertEqual(x.sbp, sbp)
    test_case.assertEqual(x.placement, placement)
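# Hedged standalone example of what the helper above checks: a global
# (consistent) empty tensor carries the requested placement and sbp.
# ranks=[0] is the newer placement spelling; other snippets here use {0: [0]}.
import oneflow as flow

placement = flow.placement("cpu", ranks=[0])
x = flow.empty(2, 3, placement=placement, sbp=flow.sbp.broadcast)
assert x.placement == placement and x.sbp[0] == flow.sbp.broadcast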