def split_to_broadcast_job(x: oft.Numpy.Placeholder((96, 96))):
    """Boxing test: re-box a split(src_axis) blob to broadcast on gpu 0:0-1.

    NOTE(review): `src_axis` is captured from the enclosing test scope.
    """
    with flow.scope.placement("gpu", "0:0-1"):
        split_blob = flow.identity(
            x.with_distribute(flow.distribute.split(src_axis))
        )
        dst = flow.identity(
            split_blob.with_distribute(flow.distribute.broadcast())
        )
    return dst
def partial_sum_to_broadcast_job(x: oft.Numpy.Placeholder((96, 96, 96))):
    """Boxing test: partial-sum-producing source placement -> broadcast destination.

    NOTE(review): src/dst device type and count come from the enclosing scope.
    """
    src_machine_ids = "0:0-" + str(src_device_num - 1)
    dst_machine_ids = "0:0-" + str(dst_device_num - 1)
    with flow.scope.placement(src_device_type, src_machine_ids):
        # reduce over the split axis so the source behaves like a partial sum
        partial = flow.identity(x.with_distribute(flow.distribute.split(0)))
        partial = flow.math.reduce_sum(partial, axis=0)
    with flow.scope.placement(dst_device_type, dst_machine_ids):
        dst = flow.identity(partial.with_distribute(flow.distribute.broadcast()))
    return dst
def partial_sum_to_split_job(x: oft.Numpy.Placeholder((96, 96, 96))):
    """Boxing test: partial-sum source -> split(dst_axis), single placement.

    NOTE(review): `dst_axis` is captured from the enclosing test scope.
    """
    with flow.scope.placement("gpu", "0:0-1"):
        # reduce over the split axis so the source behaves like a partial sum
        partial = flow.identity(x.with_distribute(flow.distribute.split(0)))
        partial = flow.math.reduce_sum(partial, axis=0)
        dst = flow.identity(
            partial.with_distribute(flow.distribute.split(dst_axis))
        )
    return dst
def test_job(x: oft.Numpy.Placeholder(input_shape, dtype=flow.float32),):
    """Compare fused batch_normalization_relu (y1) with separate BN + relu (y2).

    Forward values and input gradients of both paths are captured through
    test_global_storage for comparison by the caller.

    NOTE(review): input_shape, data_type and axis are captured from the
    enclosing test scope.
    """
    v = flow.get_variable(
        name="v",
        shape=(1,),
        dtype=flow.float32,
        initializer=flow.zeros_initializer(),
    )
    # add a trainable variable so gradients flow back to the x1/x2 watchers
    x = x + v
    x1 = flow.identity(x)
    x2 = flow.identity(x)
    flow.watch_diff(x1, test_global_storage.Setter("x1_diff"))
    flow.watch_diff(x2, test_global_storage.Setter("x2_diff"))
    # cast to the (possibly reduced-precision) type under test
    x1 = flow.cast(x1, data_type)
    x2 = flow.cast(x2, data_type)
    # fused path vs. reference two-op path
    y1 = flow.layers.batch_normalization_relu(x1, axis=axis, name="BN1")
    y2 = flow.math.relu(flow.layers.batch_normalization(x2, axis=axis, name="BN2"))
    y1 = flow.cast(y1, flow.float32)
    y2 = flow.cast(y2, flow.float32)
    flow.watch(y1, test_global_storage.Setter("y1"))
    flow.watch(y2, test_global_storage.Setter("y2"))
    loss = flow.math.reduce_mean(y1 + y2)
    flow.optimizer.SGD(
        flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
    ).minimize(flow.math.reduce_sum(loss))
    return loss
def split_to_split_job(x: oft.Numpy.Placeholder((32, 16, 64, 48))):
    """Boxing test: split(src_axis) -> split(dst_axis) on one placement.

    NOTE(review): src_axis/dst_axis come from the enclosing test scope.
    """
    with flow.scope.placement("gpu", "0:0-1"):
        src_blob = flow.identity(
            x.with_distribute(flow.distribute.split(src_axis))
        )
        dst_blob = flow.identity(
            src_blob.with_distribute(flow.distribute.split(dst_axis))
        )
    return dst_blob
def _dense_layer(
    inputs,
    units,
    activation=None,
    use_bias=True,
    kernel_initializer=None,
    bias_initializer=None,
    trainable=True,
    name=None,
):
    """Fully-connected layer whose weight/bias are repeated per batch piece.

    NOTE(review): relies on `args.num_piece_in_batch`, `flow` and `id_util`
    from the enclosing module scope.
    """
    in_shape = inputs.shape
    in_num_axes = len(in_shape)
    assert in_num_axes >= 2
    prefix = id_util.UniqueStr("Dense_") if name is None else name
    if in_num_axes > 2:
        # collapse leading axes so matmul sees a 2-D input
        inputs = flow.reshape(inputs, (-1, in_shape[-1]))
    if kernel_initializer is None:
        kernel_initializer = flow.constant_initializer(0)
    weight = flow.get_variable(
        name="{}-weight".format(prefix),
        shape=(units, inputs.shape[1]),
        dtype=inputs.dtype,
        initializer=kernel_initializer,
        trainable=trainable,
        model_name="weight",
    )
    weight = flow.repeat(flow.identity(weight), args.num_piece_in_batch)
    out = flow.matmul(
        a=inputs,
        b=weight,
        transpose_b=True,
        name="{}_matmul".format(prefix),
    )
    if use_bias:
        if bias_initializer is None:
            bias_initializer = flow.constant_initializer(0)
        bias = flow.get_variable(
            name="{}-bias".format(prefix),
            shape=(units,),
            dtype=inputs.dtype,
            initializer=bias_initializer,
            trainable=trainable,
            model_name="bias",
        )
        bias = flow.repeat(flow.identity(bias), args.num_piece_in_batch)
        out = flow.nn.bias_add(out, bias, name="{}_bias_add".format(prefix))
    if activation is not None:
        out = activation(out, name="{}_activation".format(prefix))
    if in_num_axes > 2:
        # restore the original leading axes
        out = flow.reshape(out, in_shape[:-1] + (units,))
    return out
def broadcast_to_compatible_with_fn(
    x_def: oft.ListNumpy.Placeholder(x_shape, dtype=flow.float),
    a_def: oft.ListNumpy.Placeholder(a_shape, dtype=flow.float),
    b_def: oft.ListNumpy.Placeholder(b_shape, dtype=flow.float),
):
    """Broadcast x_def to a shape compatible with both a_def and b_def."""
    compatibles = [flow.identity(a_def), flow.identity(b_def)]
    return flow.broadcast_to_compatible_with(x_def, compatibles)
def broadcast_to_broadcast_job(x: oft.Numpy.Placeholder((96, 96, 96))):
    """Boxing test: broadcast on source placement -> broadcast on destination."""
    src_machine_ids = "0:0-" + str(src_device_num - 1)
    dst_machine_ids = "0:0-" + str(dst_device_num - 1)
    with flow.scope.placement(src_device_type, src_machine_ids):
        src = flow.identity(x.with_distribute(flow.distribute.broadcast()))
    with flow.scope.placement(dst_device_type, dst_machine_ids):
        dst = flow.identity(src.with_distribute(flow.distribute.broadcast()))
    return dst
def split_to_broadcast_job(input_blob: oft.Numpy.Placeholder((96, 96))):
    """Boxing test: split(0) on a single gpu -> broadcast across two nodes."""
    with flow.scope.placement("gpu", "0:0"):
        src = flow.identity(
            input_blob.with_distribute(flow.distribute.split(0))
        )
    # destination spans machine 0 and machine 1
    with flow.scope.placement("gpu", ["0:0", "1:0"]):
        dst = flow.identity(src.with_distribute(flow.distribute.broadcast()))
    return dst
def build_s2s_all2all(input_blob, src_axis, dst_axis):
    """Build a split(src_axis) -> split(dst_axis) (all-to-all) boxing subgraph."""
    src_machine_ids = "0:0-" + str(src_device_num - 1)
    dst_machine_ids = "0:0-" + str(dst_device_num - 1)
    with flow.scope.placement(src_device_type, src_machine_ids):
        src = flow.identity(
            input_blob.with_distribute(flow.distribute.split(src_axis))
        )
    with flow.scope.placement(dst_device_type, dst_machine_ids):
        dst = flow.identity(
            src.with_distribute(flow.distribute.split(dst_axis))
        )
    return dst
def build_b2b(input_blob, src_device_num, dst_device_num):
    """Build a broadcast -> broadcast boxing subgraph across two placements."""
    src_machine_ids = "0:0-" + str(src_device_num - 1)
    dst_machine_ids = "0:0-" + str(dst_device_num - 1)
    with flow.scope.placement(src_device_type, src_machine_ids):
        src = flow.identity(
            input_blob.with_distribute(flow.distribute.broadcast())
        )
    with flow.scope.placement(dst_device_type, dst_machine_ids):
        dst = flow.identity(
            src.with_distribute(flow.distribute.broadcast())
        )
    return dst
def split_to_split_job(x: oft.Numpy.Placeholder((32, 16, 64, 48))):
    """Boxing test: split(src_axis) -> split(dst_axis) across two placements.

    NOTE(review): device types/counts and axes come from the enclosing scope.
    """
    src_machine_ids = "0:0-" + str(src_device_num - 1)
    dst_machine_ids = "0:0-" + str(dst_device_num - 1)
    with flow.scope.placement(src_device_type, src_machine_ids):
        src = flow.identity(x.with_distribute(flow.distribute.split(src_axis)))
    with flow.scope.placement(dst_device_type, dst_machine_ids):
        dst = flow.identity(src.with_distribute(flow.distribute.split(dst_axis)))
    return dst
def build_p2b(input_blob, src_device_num, dst_device_num):
    """Build a partial-sum -> broadcast boxing subgraph across two placements."""
    src_machine_ids = "0:0-" + str(src_device_num - 1)
    dst_machine_ids = "0:0-" + str(dst_device_num - 1)
    with flow.scope.placement(src_device_type, src_machine_ids):
        # reduce over the split axis so the source behaves like a partial sum
        src = flow.identity(
            input_blob.with_distribute(flow.distribute.split(0))
        )
        src = flow.math.reduce_sum(src, axis=0)
    with flow.scope.placement(dst_device_type, dst_machine_ids):
        dst = flow.identity(
            src.with_distribute(flow.distribute.broadcast())
        )
    return dst
def multi_lbi_job(x: oft.Numpy.Placeholder((96, 96, 96))):
    """Boxing test for multiple logical blob ids through one identity_n.

    Three differently-distributed copies of x (split(0), split(1), broadcast)
    are re-boxed on the destination placement to split(1), broadcast, split(1).

    NOTE(review): device types/counts come from the enclosing test scope.
    """
    with flow.scope.placement(src_device_type, "0:0-" + str(src_device_num - 1)):
        src_s0 = flow.identity(x.with_distribute(flow.distribute.split(0)))
        src_s1 = flow.identity(x.with_distribute(flow.distribute.split(1)))
        # BUG FIX: src_b ("b" = broadcast, parallel to src_s0/src_s1) was
        # built with split(1), duplicating src_s1's distribution and leaving
        # the broadcast->split(1) path untested.
        src_b = flow.identity(x.with_distribute(flow.distribute.broadcast()))
        (t0_0, t0_1, t0_2) = flow.identity_n((src_s0, src_s1, src_b))
    with flow.scope.placement(dst_device_type, "0:0-" + str(dst_device_num - 1)):
        t0_0 = t0_0.with_distribute(flow.distribute.split(1))
        t0_1 = t0_1.with_distribute(flow.distribute.broadcast())
        t0_2 = t0_2.with_distribute(flow.distribute.split(1))
        (t1_0, t1_1, t1_2) = flow.identity_n((t0_0, t0_1, t0_2))
    return t1_0, t1_1, t1_2
def _conv2d_layer(
    args,
    name,
    input,
    filters,
    kernel_size=3,
    strides=1,
    padding="SAME",
    data_format="NCHW",
    dilation_rate=1,
    activation=op_conf_util.kRelu,
    use_bias=False,
    weight_initializer=flow.random_uniform_initializer(),
    bias_initializer=flow.random_uniform_initializer(),
):
    """2-D convolution layer whose weight/bias are repeated per batch piece.

    NOTE(review): weight shape uses input.shape[1] as the channel count, so
    this assumes NCHW input — confirm before passing data_format="NHWC".
    """
    weight = flow.get_variable(
        name + "-weight",
        shape=(filters, input.shape[1], kernel_size, kernel_size),
        dtype=input.dtype,
        initializer=weight_initializer,
    )
    weight = flow.repeat(flow.identity(weight), args.num_piece_in_batch)
    output = flow.nn.conv2d(
        input, weight, strides, padding, data_format, dilation_rate, name=name
    )
    if use_bias:
        bias = flow.get_variable(
            name + "-bias",
            shape=(filters,),
            dtype=input.dtype,
            initializer=bias_initializer,
        )
        bias = flow.repeat(flow.identity(bias), args.num_piece_in_batch)
        output = flow.nn.bias_add(output, bias, data_format)
    if activation is not None:
        if activation == op_conf_util.kRelu:
            output = flow.math.relu(output)
        else:
            # only relu is supported by this helper
            raise NotImplementedError
    return output
def nvtx_range_job(x: oft.Numpy.Placeholder((4, 4, 1024, 1024))):
    """Profiler test: wraps runs of softmax/gelu ops in NVTX ranges so they
    appear as named regions ("softmax", "gelu") in an Nsight trace."""
    # trainable variable so minimize() below has something to update
    x += flow.get_variable(
        name="v1",
        shape=(1,),
        dtype=flow.float,
        initializer=flow.zeros_initializer(),
    )
    x = flow.math.relu(x)
    # five softmax ops inside the "softmax" NVTX range
    x = flow.profiler.nvtx_start(x, mark_prefix="softmax")
    x = flow.nn.softmax(x)
    x = flow.nn.softmax(x)
    x = flow.nn.softmax(x)
    x = flow.nn.softmax(x)
    x = flow.nn.softmax(x)
    x = flow.profiler.nvtx_end(x, mark_prefix="softmax")
    x = flow.math.relu(x)
    # six gelu ops inside the "gelu" NVTX range
    x = flow.profiler.nvtx_start(x, mark_prefix="gelu")
    x = flow.math.gelu(x)
    x = flow.math.gelu(x)
    x = flow.math.gelu(x)
    x = flow.math.gelu(x)
    x = flow.math.gelu(x)
    x = flow.math.gelu(x)
    x = flow.profiler.nvtx_end(x, mark_prefix="gelu")
    # learning rate 0: the backward pass runs (so it is profiled) but
    # weights stay unchanged
    flow.optimizer.SGD(
        flow.optimizer.PiecewiseConstantScheduler([], [0]), momentum=0
    ).minimize(x)
    return flow.identity(x)
def SoftmaxJob():
    """Train-mode softmax test capturing fwd/bwd values via test_global_storage.

    NOTE(review): device_type, x_shape, dtype, data_type and axis are captured
    from the enclosing test scope.
    """
    with flow.scope.placement(device_type, "0:0"):
        x = flow.get_variable(
            "x",
            shape=x_shape,
            dtype=dtype,
            initializer=flow.random_uniform_initializer(minval=-1.0, maxval=1.0),
            trainable=True,
        )
        x_before_identity = x
        x = flow.identity(x)
        if data_type == "float16":
            # round-trip through fp16 to exercise the half-precision kernel
            half = flow.cast(x, dtype=flow.float16)
            loss = flow.cast(flow.nn.softmax(half, axis=axis), dtype=flow.float)
        else:
            loss = flow.nn.softmax(x, axis=axis)
        flow.watch(x, test_global_storage.Setter("x"))
        flow.watch_diff(x, test_global_storage.Setter("x_diff"))
        flow.watch(loss, test_global_storage.Setter("loss"))
        flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
        total_loss = loss * x_before_identity
        flow.optimizer.SGD(
            flow.optimizer.PiecewiseConstantScheduler([], [1e-4]), momentum=0
        ).minimize(total_loss)
        return loss
def PartialFcJob(labels: oft.Numpy.Placeholder(
        (batch_size, ), dtype=type_name_to_flow_type[label_type])):
    """distributed_partial_fc_sample fwd/bwd test across four devices.

    NOTE(review): device_type, batch_size, num_classes, num_sample and the
    label type mapping come from the enclosing test scope.
    """
    with flow.scope.placement(device_type, "0:0"):
        fc_weight = flow.get_variable(
            "x-weight",
            shape=(num_classes, 128),
            dtype=flow.float,
            initializer=flow.random_uniform_initializer(minval=-10, maxval=10),
            trainable=True,
        )
    with flow.scope.placement(device_type, "0:0-3"):
        # renamed local: original "lebels_distribute" was a typo for "labels"
        labels_distribute = flow.distribute.broadcast()
        weight_distribute = flow.distribute.split(0)
        (
            maped_label,
            sampled_label,
            sampled_weight,
        ) = flow.distributed_partial_fc_sample(
            weight=fc_weight.with_distribute(weight_distribute),
            label=labels.with_distribute(labels_distribute),
            num_sample=num_sample,
        )
    with flow.scope.placement(device_type, "0:0"):
        sampled_weight = flow.identity(sampled_weight)
        loss = flow.math.square(sampled_weight)
        flow.optimizer.SGD(
            flow.optimizer.PiecewiseConstantScheduler([], [1e-4]), momentum=0
        ).minimize(loss)
        flow.watch(fc_weight, test_global_storage.Setter("x"))
        flow.watch_diff(fc_weight, test_global_storage.Setter("x_diff"))
        flow.watch_diff(
            sampled_weight, test_global_storage.Setter("sampled_weight_diff")
        )
    return fc_weight, maped_label, sampled_label, sampled_weight
def SparseSoftmaxCrossEntropyWithLogitsJob(labels: oft.Numpy.Placeholder(
        (batch_size, ), dtype=type_name_to_flow_type[label_type])):
    """Model-parallel sparse softmax cross-entropy fwd/bwd test.

    Logits are split along the class axis across four devices while labels
    are broadcast; values are captured via test_global_storage.
    """
    with flow.scope.placement(device_type, "0:0"):
        x = flow.get_variable(
            "x",
            shape=(batch_size, num_classes),
            dtype=type_name_to_flow_type[data_type],
            initializer=flow.random_uniform_initializer(minval=-10, maxval=10),
            trainable=True,
        )
    with flow.scope.placement(device_type, "0:0-3"):
        # renamed local: original "lebels_distribute" was a typo for "labels"
        labels_distribute = flow.distribute.broadcast()
        logits_distribute = flow.distribute.split(len(x.shape) - 1)
        loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
            labels=labels.with_distribute(labels_distribute),
            logits=x.with_distribute(logits_distribute),
        )
        loss = flow.math.square(loss)
    with flow.scope.placement(device_type, "0:0"):
        loss = flow.identity(loss)
        flow.optimizer.SGD(
            flow.optimizer.PiecewiseConstantScheduler([], [1e-4]), momentum=0
        ).minimize(loss)
        flow.watch(x, test_global_storage.Setter("x"))
        flow.watch_diff(x, test_global_storage.Setter("x_diff"))
        flow.watch(loss, test_global_storage.Setter("loss"))
        flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
    return loss
def dynamic_concat_job(
    input_0_def: oft.ListNumpy.Placeholder(
        shape=input_static_shape, dtype=flow.float
    ),
    input_1_def: oft.ListNumpy.Placeholder(
        shape=input_static_shape, dtype=flow.float
    ),
):
    """Concat of two dynamic inputs scaled by trainable scalars, with watch
    callbacks before and after the multiply and watch_diff on the gradients.

    NOTE(review): axis, input_static_shape, watch_cb and make_watch_diff_cb
    are captured from the enclosing test scope; watch callback ORDER is part
    of what the test checks.
    """
    var_0 = flow.get_variable(
        "Var0",
        shape=(1,),
        dtype=flow.float,
        initializer=flow.constant_initializer(value=1, dtype=flow.float),
        trainable=True,
    )
    var_1 = flow.get_variable(
        "Var1",
        shape=(1,),
        dtype=flow.float,
        initializer=flow.constant_initializer(value=1, dtype=flow.float),
        trainable=True,
    )
    # variables are consistent; inputs are mirrored — align all four with the
    # current logical view before mixing them
    var_0 = flow.cast_to_current_logical_view(var_0)
    var_1 = flow.cast_to_current_logical_view(var_1)
    input_0_def = flow.cast_to_current_logical_view(input_0_def)
    input_1_def = flow.cast_to_current_logical_view(input_1_def)
    if callable(watch_cb):
        # pre-multiply snapshots
        flow.watch(var_0, watch_cb)
        flow.watch(var_1, watch_cb)
        flow.watch(flow.identity(input_0_def), watch_cb)
        flow.watch(flow.identity(input_1_def), watch_cb)
    var_0 = var_0 * input_0_def
    var_1 = var_1 * input_1_def
    if callable(watch_cb):
        # post-multiply snapshots
        flow.watch(var_0, watch_cb)
        flow.watch(var_1, watch_cb)
    result = flow.concat(
        [var_0, var_1], axis=axis, max_dim_size=input_static_shape[axis]
    )
    flow.optimizer.SGD(
        flow.optimizer.PiecewiseConstantScheduler([], [1e-4]), momentum=0
    ).minimize(result)
    flow.watch_diff(var_0, make_watch_diff_cb(0))
    flow.watch_diff(var_1, make_watch_diff_cb(1))
    return result
def DynamicBinaryJob(x: oft.ListNumpy.Placeholder((20, ))):
    """Split x into binary-sized pieces on cpu 0:0, concat on cpu 1:0, copy back."""
    print("in_shape: ", x.shape)
    with flow.scope.placement("cpu", "0:0"):
        pieces = flow.experimental.dynamic_binary_split(x, base_shift=4, out_num=6)
        identities = []
        for piece in pieces:
            print("out_shape: ", piece.shape)
            identities.append(flow.identity(piece))
    with flow.scope.placement("cpu", "1:0"):
        merged = flow.experimental.dynamic_binary_concat(identities, x)
        print("concat_shape: ", merged.shape)
    with flow.scope.placement("cpu", "0:0"):
        result = flow.identity(merged)
        print("return_shape: ", result.shape)
    return result
def cast_to_current_logical_view(
    x: remote_blob_util.BlobDef,
) -> remote_blob_util.BlobDef:
    """Insert an identity op when the blob's view disagrees with the scope view.

    A ConsistentBlob inside a mirrored-view scope (or a MirroredBlob inside a
    consistent-view scope) is passed through oneflow.identity; otherwise x is
    returned unchanged.
    """
    consistent_in_mirrored_scope = (
        isinstance(x, remote_blob_util.ConsistentBlob)
        and oneflow.scope.mirrored_view_enabled()
    )
    mirrored_in_consistent_scope = (
        isinstance(x, remote_blob_util.MirroredBlob)
        and oneflow.scope.consistent_view_enabled()
    )
    if consistent_in_mirrored_scope or mirrored_in_consistent_scope:
        x = oneflow.identity(x)
    return x
def cast_to_current_logical_view(
    x: oneflow_api.BlobDesc,
) -> oneflow_api.BlobDesc:
    """Insert an identity op when the blob's view disagrees with the scope view.

    A ConsistentBlob inside a mirrored-view scope (or a MirroredBlob inside a
    consistent-view scope) is passed through oneflow.identity; otherwise x is
    returned unchanged.
    """
    consistent_in_mirrored_scope = (
        isinstance(x, oneflow_api.ConsistentBlob)
        and oneflow.scope.mirrored_view_enabled()
    )
    mirrored_in_consistent_scope = (
        isinstance(x, oneflow_api.MirroredBlob)
        and oneflow.scope.consistent_view_enabled()
    )
    if consistent_in_mirrored_scope or mirrored_in_consistent_scope:
        x = oneflow.identity(x)
    return x
def test_job(
    x: oft.Numpy.Placeholder(input_shape, dtype=flow.float32),
    labels: oft.Numpy.Placeholder(label_shape, dtype=flow.int32),
):
    """Compare the fused combined_margin_loss op (y1) against a reference
    margin_loss implementation (y2) with model-parallel split(1) logits.

    NOTE(review): m1/m2/m3/s, input_shape, label_shape, data_type and
    margin_loss are captured from the enclosing test scope.
    """
    with flow.scope.placement("gpu", "0:0"):
        v = flow.get_variable(
            name="v",
            shape=(1,),
            dtype=flow.float32,
            initializer=flow.zeros_initializer(),
        )
        # add a trainable variable so gradients flow back to x1/x2 watchers
        x = x + v
        x1 = flow.identity(x)
        x2 = flow.identity(x)
        flow.watch_diff(x1, test_global_storage.Setter("x1_diff"))
        flow.watch_diff(x2, test_global_storage.Setter("x2_diff"))
        x1 = flow.cast(x1, data_type)
        x2 = flow.cast(x2, data_type)
    with flow.scope.placement("gpu", "0:0-3"):
        # fused op: logits split along class axis, labels broadcast
        y1 = (
            flow.combined_margin_loss(
                x1.with_distribute(flow.distribute.split(1)),
                labels.with_distribute(flow.distribute.broadcast()),
                m1,
                m2,
                m3,
            )
            * s
        )
        y2 = margin_loss(m1, m2, m3, s, x2, labels)
    with flow.scope.placement("gpu", "0:0"):
        y1 = flow.cast(y1, flow.float)
        y2 = flow.cast(y2, flow.float)
        flow.watch(y1, test_global_storage.Setter("y1"))
        flow.watch(y2, test_global_storage.Setter("y2"))
        loss = y1 + y2
        flow.optimizer.SGD(
            flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
        ).minimize(flow.math.reduce_sum(loss))
        return loss
def slice(input_blob: oft.Numpy.Placeholder(shape=(2, 5, 4), dtype=flow.float)):
    """slice_v2 gradient test using the legacy flow.losses API.

    NOTE(review): input_blob is unused — the sliced tensor comes from a
    trainable variable; shadowing builtin `slice` follows the file's naming.
    """
    var = flow.get_variable(
        shape=(2, 5, 4),
        dtype=flow.float,
        initializer=flow.random_uniform_initializer(0, 2),
        name="variable",
    )
    var = flow.identity(var)
    flow.watch_diff(var, slice_grad_cb)
    sliced = flow.slice_v2(var, [(None, None, None), (2, -2, None)])
    flow.losses.add_loss(sliced)
    return sliced
def slice(input_blob: oft.Numpy.Placeholder(shape=(2, 5, 4), dtype=flow.float)):
    """slice_v2 gradient test using the optimizer (SGD) API.

    NOTE(review): input_blob is unused — the sliced tensor comes from a
    trainable variable; shadowing builtin `slice` follows the file's naming.
    """
    var = flow.get_variable(
        shape=(2, 5, 4),
        dtype=flow.float,
        initializer=flow.random_uniform_initializer(0, 2),
        name="variable",
    )
    var = flow.identity(var)
    flow.watch_diff(var, slice_grad_cb)
    sliced = flow.slice_v2(var, [(None, None, None), (2, -2, None)])
    flow.optimizer.SGD(
        flow.optimizer.PiecewiseConstantScheduler([], [1e-3]), momentum=0
    ).minimize(sliced)
    return sliced
def ReduceMaxJob(x: oft.Numpy.Placeholder(input_shape, dtype=flow.float)):
    """reduce_max fwd/bwd test (legacy flow.losses API), values captured via
    test_global_storage."""
    with flow.scope.placement(device_type, "0:0"):
        # trainable variable so the backward pass produces x gradients
        x += flow.get_variable(
            name="v1",
            shape=input_shape,
            dtype=flow.float,
            initializer=flow.zeros_initializer(),
        )
        result = flow.math.reduce_max(x, axis=axis, keepdims=keepdims)
        result = flow.identity(result)
        flow.losses.add_loss(result)
        flow.watch(x, test_global_storage.Setter("x"))
        flow.watch_diff(x, test_global_storage.Setter("x_diff"))
        flow.watch(result, test_global_storage.Setter("loss"))
        flow.watch_diff(result, test_global_storage.Setter("loss_diff"))
        return result
def ReduceMinJob(x: oft.Numpy.Placeholder(input_shape, dtype=flow.float)):
    """reduce_min fwd/bwd test (SGD optimizer API), values captured via
    test_global_storage."""
    with flow.scope.placement(device_type, "0:0"):
        # trainable variable so the backward pass produces x gradients
        x += flow.get_variable(
            name="v1",
            shape=input_shape,
            dtype=flow.float,
            initializer=flow.zeros_initializer(),
        )
        result = flow.math.reduce_min(x, axis=axis, keepdims=keepdims)
        result = flow.identity(result)
        flow.optimizer.SGD(
            flow.optimizer.PiecewiseConstantScheduler([], [1e-4]), momentum=0
        ).minimize(result)
        flow.watch(x, test_global_storage.Setter("x"))
        flow.watch_diff(x, test_global_storage.Setter("x_diff"))
        flow.watch(result, test_global_storage.Setter("loss"))
        flow.watch_diff(result, test_global_storage.Setter("loss_diff"))
        return result
def two_stage_reduce_job(x: oft.Numpy.Placeholder((4, 20, 20, 20))):
    """Two-stage reduce test: input split across four devices, then reduced.

    NOTE(review): flow_func, axis, split_axis and device_type are captured
    from the enclosing test scope.
    """
    with flow.scope.placement(device_type, "0:0"):
        # trainable variable so the backward pass produces x gradients
        x += flow.get_variable(
            name="v1",
            shape=(1,),
            dtype=flow.float,
            initializer=flow.zeros_initializer(),
        )
    with flow.scope.placement(device_type, "0:0-3"):
        reduced = flow_func(
            x.with_distribute(flow.distribute.split(split_axis)),
            axis=axis,
            keepdims=True,
        )
        reduced = flow.identity(reduced)
        flow.losses.add_loss(reduced)
        flow.watch(x, test_global_storage.Setter("x"))
        flow.watch_diff(x, test_global_storage.Setter("x_diff"))
        return reduced
def ReduceMeanJob():
    """reduce_mean fwd/bwd test (legacy flow.losses API), values captured via
    test_global_storage."""
    with flow.scope.placement(device_type, "0:0"):
        x = flow.get_variable(
            "x",
            shape=input_shape,
            dtype=flow.float,
            initializer=flow.random_uniform_initializer(minval=-10, maxval=10),
            trainable=True,
        )
        result = flow.math.reduce_mean(x, axis=axis, keepdims=keepdims)
        # TODO: fix facade and add_loss bug
        result = flow.identity(result)
        flow.losses.add_loss(result)
        flow.watch(x, test_global_storage.Setter("x"))
        flow.watch_diff(x, test_global_storage.Setter("x_diff"))
        flow.watch(result, test_global_storage.Setter("loss"))
        flow.watch_diff(result, test_global_storage.Setter("loss_diff"))
        return result