def sort(
    input: oneflow._oneflow_internal.BlobDesc,
    axis: int = -1,
    direction: str = "ASCENDING",
    name: Optional[str] = None,
) -> oneflow._oneflow_internal.BlobDesc:
    """This operator sorts the input Blob at the specified axis.

    Args:
        input (oneflow._oneflow_internal.BlobDesc): A Blob.
        axis (int, optional): The dimension to be sorted. Defaults to the last dim (-1).
        direction (str, optional): The direction in which to sort the Blob values. If the direction is "ASCENDING", the input is sorted in ascending order; otherwise it is sorted in descending order. Defaults to "ASCENDING".
        name (Optional[str], optional): The name for the operation. Defaults to None.

    Returns:
        oneflow._oneflow_internal.BlobDesc: The sorted Blob.

    For example:

    .. code-block:: python

        import oneflow.compatible.single_client as flow
        import numpy as np
        import oneflow.compatible.single_client.typing as tp


        @flow.global_function()
        def sort_Job(x: tp.Numpy.Placeholder((5, ))
        ) -> tp.Numpy:
            return flow.sort(input=x)

        x = np.array([10, 2, 9, 3, 7]).astype("float32")
        out = sort_Job(x)

        # out [ 2.  3.  7.  9. 10.]

    """
    assert direction in ["ASCENDING", "DESCENDING"]
    name = name if name is not None else id_util.UniqueStr("Sort_")
    num_axes = len(input.shape)
    axis = axis if axis >= 0 else axis + num_axes
    assert 0 <= axis < num_axes, "axis out of range"
    if axis == num_axes - 1:
        return _sort_at_last_dim(input, direction, name)
    else:
        perm = get_perm_when_transpose_axis_to_last_dim(num_axes, axis)
        x = flow.transpose(input, perm, False, True, name + "_transpose")
        x = _sort_at_last_dim(x, direction, name)
        return flow.transpose(
            x, get_inversed_perm(perm), False, True, name + "_inverse_transpose"
        )
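
# A minimal NumPy sketch of the permutation trick used above for axis != num_axes - 1.
# The helpers below only mirror (and do not call) the real
# get_perm_when_transpose_axis_to_last_dim / get_inversed_perm utilities: move the
# target axis to the last position, sort there, then apply the inverse permutation.
import numpy as np

def _perm_axis_to_last(num_axes, axis):
    # e.g. num_axes=3, axis=1 -> [0, 2, 1]
    perm = [i for i in range(num_axes) if i != axis]
    perm.append(axis)
    return perm

def _inverse_perm(perm):
    inv = [0] * len(perm)
    for dst, src in enumerate(perm):
        inv[src] = dst
    return inv

x = np.random.rand(2, 5, 3).astype("float32")
perm = _perm_axis_to_last(x.ndim, 1)
y = np.sort(x.transpose(perm), axis=-1).transpose(_inverse_perm(perm))
assert np.allclose(y, np.sort(x, axis=1))
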
def __call__(self, hidden_states):
    # hidden_states shape: (batch_size, seq_length, hidden_size)
    # or (seq_length, batch_size, hidden_size) [seq_len dim leading]
    # data parallel sbp: S(0)
    # 2d sbp: [S(0), B]
    assert len(hidden_states.shape) == 3
    assert hidden_states.shape[-1] == self.hidden_size

    if (hidden_states.shape[0] == self.batch_size
            and hidden_states.shape[1] == self.seq_length):
        is_seq_len_dim_leading = False
    elif (hidden_states.shape[0] == self.seq_length
            and hidden_states.shape[1] == self.batch_size):
        is_seq_len_dim_leading = True
    else:
        raise ValueError(f"invalid hidden states shape {hidden_states.shape}")

    h = hidden_states
    with flow.scope.namespace("attn"):
        h = col_parallel_linear(
            "c_attn",
            h,
            self.hidden_size * 3,
            weight_initializer=self.initializer,
        )
        if self.multihead_attention_fusion:
            h = self.fused_multihead_attn(h)
        else:
            q, k, v = self.query_key_value(h)
            h = self.multihead_attn(q, k, v)

        if is_seq_len_dim_leading:
            # (b, n, s, h) -> (s, b, n, h)
            h = flow.transpose(h, [2, 0, 1, 3])
        else:
            # (b, n, s, h) -> (b, s, n, h)
            h = flow.transpose(h, [0, 2, 1, 3])

        # (b, s, n, h) -> (b, s, H) or (s, b, n, h) -> (s, b, H)
        h = flow.flatten(h, start_dim=2)
        h = row_parallel_linear(
            "c_proj",
            h,
            self.hidden_size,
            weight_initializer=self.output_layer_initializer,
            dropout_rate=self.hidden_dropout_rate,
            bias_dropout_fusion=self.bias_dropout_fusion,
        )

    return h
def logits(self, hidden_states, token_embeddings):
    """
    shape sig: (batch_size * seq_length, hidden_size) x (hidden_size, vocab_size)(transposed)
               -> (batch_size * seq_length, vocab_size)
    dp sbp sig: S(0) x B -> S(0)
    2d sbp sig: [S(0), B] x [B, S(1)](transposed) -> [S(0), S(1)]
    """
    assert len(hidden_states.shape) == 3
    assert np.prod(hidden_states.shape[0:2]) == self.batch_size * self.seq_length
    assert hidden_states.shape[-1] == self.hidden_size
    assert len(token_embeddings.shape) == 2
    assert token_embeddings.shape[0] == self.vocab_size
    assert token_embeddings.shape[1] == self.hidden_size

    with distribute.layer_placement_scope(-1):
        if (hidden_states.shape[0] == self.seq_length
                and hidden_states.shape[1] == self.batch_size):
            # [s, b, H] -> [b, s, H]
            h = flow.transpose(hidden_states, [1, 0, 2])
        elif (hidden_states.shape[0] == self.batch_size
                and hidden_states.shape[1] == self.seq_length):
            h = hidden_states
        else:
            raise ValueError(f"invalid hidden states shape {hidden_states.shape}")

        # [s, b, H] or [b, s, H] -> [b * s, H]
        h = flow.flatten(h, start_dim=0, end_dim=1)
        # 2d sbp sig: [S(0), B] x [B, S(1)](transposed) -> [S(0), S(1)]
        # grad 2d sbp sig: [S(0), S(1)] x [B, S(0)] -> [S(0), P] -> [S(0), B]
        h = distribute.backward_p2b_parallel_cast(h)
        lgs = flow.matmul(h, token_embeddings, transpose_b=True)

    return lgs
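
# A minimal NumPy sketch of the weight-tied logits projection above (shapes only;
# the sizes below are illustrative, not taken from a real model config): the
# flattened hidden states (batch * seq, hidden) are multiplied by the transposed
# token embedding table (vocab, hidden), producing (batch * seq, vocab) logits.
import numpy as np

batch, seq, hidden, vocab = 2, 4, 6, 16
h = np.random.rand(batch * seq, hidden).astype("float32")
token_embeddings = np.random.rand(vocab, hidden).astype("float32")
logits = h @ token_embeddings.T  # same shape signature as flow.matmul(..., transpose_b=True)
assert logits.shape == (batch * seq, vocab)
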
def query_key_value(self, h):
    """
    Split the input into q, k, v and split the hidden states into heads,
    shape: (batch_size, seq_length, hidden_size)
        -> (batch_size, seq_length, num_attn_heads, head_size)
        -> (batch_size, num_attn_heads, seq_length, head_size)
    """
    assert len(h.shape) == 3
    # Note: the factor 3 sits between num_heads and head_size so that the q, k and v
    # features of each head are arranged contiguously in the last dimension.
    new_shape = (
        h.shape[0],
        h.shape[1],
        self.num_heads,
        3 * self.head_size,
    )
    if h.shape[0] == self.seq_length and h.shape[1] == self.batch_size:
        perm = [1, 2, 0, 3]
    elif h.shape[0] == self.batch_size and h.shape[1] == self.seq_length:
        perm = [0, 2, 1, 3]
    else:
        raise ValueError

    h = flow.reshape(h, new_shape)
    q, k, v = (
        flow.transpose(
            flow.slice(
                h,
                begin=[None, None, None, i * self.head_size],
                size=[None, None, None, self.head_size],
            ),
            perm=perm,
        )
        for i in range(3)
    )
    return q, k, v
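
# A minimal NumPy sketch (hypothetical sizes) of the slicing above, assuming the
# c_attn output lays out each head's q, k and v features contiguously as
# (..., num_heads, 3 * head_size): slice the last dim into thirds, then transpose
# to (batch, num_heads, seq, head_size).
import numpy as np

batch, seq, num_heads, head_size = 2, 4, 3, 5
h = np.random.rand(batch, seq, num_heads, 3 * head_size).astype("float32")
q, k, v = (
    h[..., i * head_size:(i + 1) * head_size].transpose(0, 2, 1, 3)
    for i in range(3)
)
assert q.shape == (batch, num_heads, seq, head_size)
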
def __call__(self, hidden_states):
    """
    hidden_states shape: (batch_size, seq_length, hidden_size)
    data parallel sbp: S(0)
    2d sbp: [S(0), B]
    """
    assert len(hidden_states.shape) == 3
    assert hidden_states.shape[0] == self.batch_size
    assert hidden_states.shape[1] == self.seq_length
    assert hidden_states.shape[2] == self.hidden_size

    if self.multihead_attention_fusion:
        with distribute.layer_placement_scope(0):
            # [b, s, H] -> [s, b, H] for multihead_attention_fusion
            h = flow.transpose(hidden_states, [1, 0, 2])
    else:
        h = hidden_states

    for i in range(self.num_layers):
        with distribute.layer_placement_scope(i):
            h = self.layers[i](h)

    # final layernorm
    with distribute.layer_placement_scope(-1):
        h = layernorm("layernorm_f", h)

    return h
def gram_matrix(input):
    b = input.shape[0]
    ch = input.shape[1]
    h = input.shape[2]
    w = input.shape[3]
    features = flow.reshape(input, [b, ch, h * w])
    features_t = flow.transpose(features, [0, 2, 1])
    gram = flow.matmul(features, features_t) / (ch * h * w)
    return gram
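
# A minimal NumPy sketch of the Gram matrix computed above (hypothetical sizes):
# flatten each (channels, height, width) feature map to (channels, height * width),
# then take the batched product features @ features^T, normalized by ch * h * w.
import numpy as np

b, ch, h, w = 2, 3, 4, 4
x = np.random.rand(b, ch, h, w).astype("float32")
features = x.reshape(b, ch, h * w)
gram = features @ features.transpose(0, 2, 1) / (ch * h * w)
assert gram.shape == (b, ch, ch)
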
def alexnet_inference(
    image: flow.typing.Numpy.Placeholder(image_shape, dtype=flow.float32),
    label: flow.typing.Numpy.Placeholder(label_shape, dtype=flow.int32),
) -> flow.typing.Numpy:
    input_lbns["image"] = image.logical_blob_name
    input_lbns["label"] = label.logical_blob_name
    # NHWC -> NCHW
    image = flow.transpose(image, perm=(0, 3, 1, 2))
    loss = alexnet(image, label, trainable=False)
    output = loss
    output_lbns["output"] = output.logical_blob_name
    return output
def resnet50(
    images,
    trainable=True,
    need_transpose=False,
    training=True,
    wd=1.0 / 32768,
    channel_last=False,
):
    weight_regularizer = flow.regularizers.l2(wd) if wd > 0.0 and wd < 1.0 else None
    builder = ResnetBuilder(weight_regularizer, trainable, training, channel_last)
    if need_transpose:
        images = flow.transpose(images, name="transpose", perm=[0, 3, 1, 2])
    if channel_last:
        images = flow.transpose(images, name="transpose", perm=[0, 2, 3, 1])

    with flow.scope.namespace("Resnet"):
        stem = builder.resnet_stem(images)
        body = builder.resnet_conv_x_body(stem)
        pool5 = flow.nn.avg_pool2d(
            body,
            ksize=7,
            strides=1,
            padding="VALID",
            data_format=builder.data_format,
            name="pool5",
        )
        fc1001 = flow.layers.dense(
            flow.reshape(pool5, (pool5.shape[0], -1)),
            units=1000,
            use_bias=True,
            kernel_initializer=flow.variance_scaling_initializer(
                2, "fan_in", "random_normal"
            ),
            bias_initializer=flow.zeros_initializer(),
            kernel_regularizer=weight_regularizer,
            bias_regularizer=weight_regularizer,
            trainable=trainable,
            name="fc1001",
        )
    return fc1001
def TransposeJob():
    with flow.scope.placement(device_type, "0:0"):
        x = flow.get_variable(
            "input",
            shape=input_shape,
            dtype=flow.float,
            initializer=flow.random_uniform_initializer(minval=2, maxval=5),
            trainable=True,
        )
        loss = flow.transpose(x, perm)
        flow.optimizer.SGD(
            flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
        ).minimize(loss)

        flow.watch(x, test_global_storage.Setter("x"))
        flow.watch_diff(x, test_global_storage.Setter("x_diff"))
        flow.watch(loss, test_global_storage.Setter("loss"))
        flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))

        return loss
def self_attn_qk_v_fw_bw(
    h: flow.typing.Numpy.Placeholder(
        shape=(seq_len, batch_size, hidden_size), dtype=flow.float32
    )
) -> typing.Tuple[flow.typing.Numpy, flow.typing.Numpy]:
    var = flow.get_variable(
        "var",
        shape=(1,),
        dtype=flow.float32,
        initializer=flow.constant_initializer(1.0, dtype=flow.float32),
        trainable=True,
    )
    h = h * var

    if fused:
        flow.watch_diff(h, test_global_storage.Setter("h_grad_fused"))
    else:
        flow.watch_diff(h, test_global_storage.Setter("h_grad"))

    if fp16:
        h = flow.amp_white_identity(h)

    alpha = get_alpha(head_size)

    if fused:
        (qmk, v) = flow.nn.fused_self_attention_query_mul_key_and_value(
            h, head_size=head_size, alpha=alpha
        )
    else:
        h = flow.reshape(h, (seq_len, batch_size, -1, 3 * head_size))
        (q, k, v) = (
            flow.transpose(
                flow.slice(
                    h,
                    begin=[None, None, None, head_size * i],
                    size=[None, None, None, head_size],
                ),
                perm=[1, 2, 0, 3],
            )
            for i in range(3)
        )
        qmk = flow.matmul(q, k, transpose_b=True, alpha=alpha)

    h = flow.matmul(qmk, v)
    loss = flow.math.reduce_sum(h)
    flow.optimizer.SGD(get_lr_scheduler(), momentum=0).minimize(loss)

    return (qmk, v)
def trt_transpose_job(x=flow.FixedTensorDef(input_shape, dtype=dtype)):
    return flow.transpose(x, perm=permute)
def TransposeForScores(input_blob, num_attention_heads, seq_length, width):
    output_blob = flow.reshape(
        input_blob, [-1, seq_length, num_attention_heads, width]
    )
    output_blob = flow.transpose(output_blob, perm=[0, 2, 1, 3])
    return output_blob
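
# A minimal NumPy sketch (hypothetical sizes) of TransposeForScores: a flat
# (batch * seq, heads * width) blob is reshaped to (batch, seq, heads, width)
# and transposed to (batch, heads, seq, width) so attention scores can be
# computed per head with a batched matmul.
import numpy as np

batch, seq, heads, width = 2, 4, 3, 5
x = np.random.rand(batch * seq, heads * width).astype("float32")
per_head = x.reshape(-1, seq, heads, width).transpose(0, 2, 1, 3)
assert per_head.shape == (batch, heads, seq, width)
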
def _AttentionLayer(
    from_blob,
    to_blob,
    attention_mask_blob,
    num_attention_heads=1,
    size_per_head=512,
    query_act=op_conf_util.kNone,
    key_act=op_conf_util.kNone,
    value_act=op_conf_util.kNone,
    attention_probs_dropout_prob=0.0,
    initializer_range=0.02,
    do_return_2d_tensor=False,
    batch_size=None,
    from_seq_length=None,
    to_seq_length=None,
):
    def TransposeForScores(input_blob, num_attention_heads, seq_length, width):
        output_blob = flow.reshape(
            input_blob, [-1, seq_length, num_attention_heads, width]
        )
        output_blob = flow.transpose(output_blob, perm=[0, 2, 1, 3])
        return output_blob

    from_blob_2d = flow.reshape(from_blob, [-1, num_attention_heads * size_per_head])
    to_blob_2d = flow.reshape(to_blob, [-1, num_attention_heads * size_per_head])
    query_blob = _FullyConnected(
        from_blob_2d,
        input_size=num_attention_heads * size_per_head,
        units=num_attention_heads * size_per_head,
        activation=query_act,
        name="query",
        weight_initializer=CreateInitializer(initializer_range),
    )
    key_blob = _FullyConnected(
        to_blob_2d,
        input_size=num_attention_heads * size_per_head,
        units=num_attention_heads * size_per_head,
        activation=key_act,
        name="key",
        weight_initializer=CreateInitializer(initializer_range),
    )
    value_blob = _FullyConnected(
        to_blob_2d,
        input_size=num_attention_heads * size_per_head,
        units=num_attention_heads * size_per_head,
        activation=value_act,
        name="value",
        weight_initializer=CreateInitializer(initializer_range),
    )
    query_blob = TransposeForScores(
        query_blob, num_attention_heads, from_seq_length, size_per_head
    )
    key_blob = TransposeForScores(
        key_blob, num_attention_heads, to_seq_length, size_per_head
    )
    attention_scores_blob = flow.matmul(query_blob, key_blob, transpose_b=True)
    attention_scores_blob = attention_scores_blob * (
        1.0 / math.sqrt(float(size_per_head))
    )
    attention_mask_blob = flow.reshape(
        attention_mask_blob, [-1, 1, from_seq_length, to_seq_length]
    )
    attention_mask_blob = flow.cast(attention_mask_blob, dtype=flow.float)
    addr_blob = (attention_mask_blob - 1.0) * 10000.0
    attention_scores_blob = attention_scores_blob + addr_blob
    attention_probs_blob = flow.nn.softmax(attention_scores_blob)
    attention_probs_blob = _Dropout(attention_probs_blob, attention_probs_dropout_prob)
    value_blob = flow.reshape(
        value_blob, [-1, to_seq_length, num_attention_heads, size_per_head]
    )
    value_blob = flow.transpose(value_blob, perm=[0, 2, 1, 3])
    context_blob = flow.matmul(attention_probs_blob, value_blob)
    context_blob = flow.transpose(context_blob, perm=[0, 2, 1, 3])
    if do_return_2d_tensor:
        context_blob = flow.reshape(
            context_blob, [-1, num_attention_heads * size_per_head]
        )
    else:
        context_blob = flow.reshape(
            context_blob, [-1, from_seq_length, num_attention_heads * size_per_head]
        )
    return context_blob
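
# A minimal NumPy sketch of the additive mask used above: positions where the
# (0/1) attention mask is 0 receive a large negative bias (-10000), so their
# softmax weight becomes effectively zero, while allowed positions get a bias of 0.
import numpy as np

mask = np.array([[1, 1, 0]], dtype="float32")    # 1 = attend, 0 = masked out
scores = np.array([[2.0, 1.0, 3.0]], dtype="float32")
addr = (mask - 1.0) * 10000.0                    # [0, 0, -10000]
probs = np.exp(scores + addr)
probs /= probs.sum(axis=-1, keepdims=True)
# probs[0, 2] is ~0, so the masked position contributes nothing after softmax.
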
def one_hot(
    indices: oneflow._oneflow_internal.BlobDesc,
    depth: int,
    on_value: Union[int, float] = 1,
    off_value: Union[int, float] = 0,
    axis: int = -1,
    dtype: Optional[flow.dtype] = None,
    name: Optional[str] = None,
) -> oneflow._oneflow_internal.BlobDesc:
    """This operator generates a one-hot Blob from the input Blob.

    If the input Blob's rank is `N`, the corresponding one-hot Blob's rank is `N+1`.
    The new axis is generated at the dimension specified by the parameter `axis`.

    The locations represented by `indices` take the value `on_value`, while all other
    locations take `off_value`.

    Args:
        indices (oneflow._oneflow_internal.BlobDesc): The input Blob.
        depth (int): The length of the one-hot axis.
        on_value (Union[int, float], optional): The fill value for the locations represented by `indices`. Defaults to 1.
        off_value (Union[int, float], optional): The fill value for all other locations. Defaults to 0.
        axis (int, optional): The dimension at which the new axis is generated. Defaults to -1.
        dtype (Optional[flow.dtype], optional): The output data type, it can be "oneflow.compatible.single_client.int32", "oneflow.compatible.single_client.int64", "oneflow.compatible.single_client.float", "oneflow.compatible.single_client.double". Defaults to None.
        name (Optional[str], optional): The name for the operation. Defaults to None.

    Note:

        The data type of the input Blob should be `int32` or `int64`.

    For example:

    Example 1:

    .. code-block:: python

        import oneflow.compatible.single_client as flow
        import oneflow.compatible.single_client.typing as tp
        import numpy as np


        @flow.global_function()
        def onehot_Job(x: tp.Numpy.Placeholder((4, ), dtype=flow.int32)
        ) -> tp.Numpy:
            return flow.one_hot(indices=x, depth=5, axis=-1, dtype=flow.int32)

        x = np.array([0, 3, 1, 2]).astype(np.int32)
        out = onehot_Job(x)

        # out [[1 0 0 0 0]
        #      [0 0 0 1 0]
        #      [0 1 0 0 0]
        #      [0 0 1 0 0]]

    Example 2:

    .. code-block:: python

        import oneflow.compatible.single_client as flow
        import oneflow.compatible.single_client.typing as tp
        import numpy as np


        @flow.global_function()
        def onehot_Job(x: tp.Numpy.Placeholder((4, ), dtype=flow.int32)
        ) -> tp.Numpy:
            return flow.one_hot(indices=x, depth=5, axis=0, dtype=flow.int32)

        x = np.array([0, 3, 1, 2]).astype(np.int32)
        out = onehot_Job(x)

        # out [[1 0 0 0]
        #      [0 0 1 0]
        #      [0 0 0 1]
        #      [0 1 0 0]
        #      [0 0 0 0]]

    Returns:
        oneflow._oneflow_internal.BlobDesc: The one-hot Blob.
    """
    out_ndims = len(indices.shape) + 1
    if axis < 0:
        axis += out_ndims
    assert axis >= 0 and axis < out_ndims, ValueError(
        "Expected axis to be in range [%d, %d), but received %d"
        % (-out_ndims, out_ndims, axis)
    )
    out = (
        flow.user_op_builder(
            name if name is not None else id_util.UniqueStr("OneHot_")
        )
        .Op("one_hot")
        .Input("indices", [indices])
        .Attr("depth", int(depth))
        .Attr("floating_on_value", float(on_value))
        .Attr("integer_on_value", int(on_value))
        .Attr("floating_off_value", float(off_value))
        .Attr("integer_off_value", int(off_value))
        .Attr("dtype", dtype)
        .Output("out")
        .Build()
        .InferAndTryRun()
        .RemoteBlobList()[0]
    )
    if axis != out_ndims - 1:
        dim_list = list(range(0, out_ndims))
        dim_list.insert(axis, out_ndims - 1)
        dim_list.pop()
        return flow.transpose(out, dim_list)
    else:
        return out
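
# A minimal NumPy sketch of the final transpose in one_hot: the one_hot op always
# appends the new depth axis last, so for axis != -1 the depth axis is moved into
# place using a permutation built by inserting the last dim index at `axis`.
import numpy as np

indices = np.array([0, 3, 1, 2])
depth, axis = 5, 0
out = np.eye(depth, dtype="int32")[indices]    # new axis last: shape (4, 5)
out_ndims = out.ndim
dim_list = list(range(out_ndims))
dim_list.insert(axis, out_ndims - 1)
dim_list.pop()                                 # [1, 0] for axis=0, out_ndims=2
out = out.transpose(dim_list)                  # shape (5, 4), depth axis first
assert out.shape == (depth, len(indices))
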
def inceptionv3(images, trainable=True, channel_last=False):
    if channel_last:
        # if channel_last=True, then change mode from 'nchw' to 'nhwc'
        images = flow.transpose(images, name="transpose", perm=[0, 2, 3, 1])

    with flow.scope.namespace("InceptionV3"):
        # conv0: 299 x 299 x 3
        conv0 = conv2d_layer(
            "conv0", images, filters=32, kernel_size=3, strides=2, padding="VALID"
        )
        conv1 = conv2d_layer(
            "conv1", conv0, filters=32, kernel_size=3, strides=1, padding="VALID"
        )
        conv2 = conv2d_layer(
            "conv2", conv1, filters=64, kernel_size=3, strides=1, padding="SAME"
        )
        pool1 = flow.nn.max_pool2d(
            conv2, ksize=3, strides=2, padding="VALID", data_format="NCHW", name="pool1"
        )
        conv3 = conv2d_layer(
            "conv3", pool1, filters=80, kernel_size=1, strides=1, padding="VALID"
        )
        conv4 = conv2d_layer(
            "conv4", conv3, filters=192, kernel_size=3, strides=1, padding="VALID"
        )
        pool2 = flow.nn.max_pool2d(
            conv4, ksize=3, strides=2, padding="VALID", data_format="NCHW", name="pool2"
        )

        # mixed_0 ~ mixed_2
        mixed_0 = InceptionA(pool2, 0)
        mixed_1 = InceptionA(mixed_0, 1)
        mixed_2 = InceptionA(mixed_1, 2)
        # mixed_3
        mixed_3 = InceptionB(mixed_2, 3)
        # mixed_4 ~ mixed_7
        mixed_4 = InceptionC(mixed_3, 4, 128)
        mixed_5 = InceptionC(mixed_4, 5, 160)
        mixed_6 = InceptionC(mixed_5, 6, 160)
        mixed_7 = InceptionC(mixed_6, 7, 192)
        # mixed_8
        mixed_8 = InceptionD(mixed_7, 8)
        # mixed_9 ~ mixed_10
        mixed_9 = InceptionE(mixed_8, 9, "avg")
        mixed_10 = InceptionE(mixed_9, 10, "max")

        pool3 = flow.nn.avg_pool2d(
            mixed_10,
            ksize=8,
            strides=1,
            padding="VALID",
            data_format="NCHW",
            name="pool3",
        )

        # TODO: Need to transpose weight when converting model from TF to OF if
        # you want to use layers.dense interface.
        fc1 = flow.layers.dense(
            inputs=flow.reshape(pool3, [pool3.shape[0], -1]),
            units=1000,
            activation=None,
            use_bias=True,
            kernel_initializer=flow.truncated_normal(0.816496580927726),
            bias_initializer=flow.constant_initializer(),
            trainable=trainable,
            name="fc1",
        )

    return fc1