def _model(dense_fields, wide_sparse_fields, deep_sparse_fields):
    # wide_embedding = _embedding('wide_embedding', wide_sparse_fields, 1, FLAGS.wide_vocab_size)
    wide_embedding = _hybrid_embedding('wide_embedding', wide_sparse_fields, 1,
                                       FLAGS.wide_vocab_size, FLAGS.hf_wide_vocab_size)
    wide_scores = flow.math.reduce_sum(wide_embedding, axis=[1], keepdims=True)
    wide_scores = flow.parallel_cast(wide_scores,
                                     distribute=flow.distribute.split(0),
                                     gradient_distribute=flow.distribute.broadcast())

    # deep_embedding = _embedding('deep_embedding', deep_sparse_fields, FLAGS.deep_embedding_vec_size,
    #                             FLAGS.deep_vocab_size, split_axis=1)
    deep_embedding = _hybrid_embedding('deep_embedding', deep_sparse_fields,
                                       FLAGS.deep_embedding_vec_size,
                                       FLAGS.deep_vocab_size, FLAGS.hf_deep_vocab_size)
    deep_features = flow.concat([deep_embedding, dense_fields], axis=1)
    for idx, units in enumerate(DEEP_HIDDEN_UNITS):
        deep_features = flow.layers.dense(
            deep_features,
            units=units,
            kernel_initializer=flow.glorot_uniform_initializer(),
            bias_initializer=flow.constant_initializer(0.0),
            activation=flow.math.relu,
            name='fc' + str(idx + 1),
        )
        deep_features = flow.nn.dropout(deep_features, rate=FLAGS.deep_dropout_rate)
    deep_scores = flow.layers.dense(
        deep_features,
        units=1,
        kernel_initializer=flow.glorot_uniform_initializer(),
        bias_initializer=flow.constant_initializer(0.0),
        name='fc' + str(len(DEEP_HIDDEN_UNITS) + 1),
    )

    scores = wide_scores + deep_scores
    return scores
def do_tensor_scatter_nd_add(params_blob, indices_blob, updates_blob):
    with flow.scope.placement(device_type, "0:0"):
        params_var = flow.get_variable(
            "params",
            shape=params_blob.shape,
            dtype=flow.float32,
            initializer=flow.constant_initializer(0),
        )
        updates_var = flow.get_variable(
            "updates",
            shape=updates_blob.shape,
            dtype=flow.float32,
            initializer=flow.constant_initializer(0),
        )
        params_var = flow.cast_to_current_logical_view(params_var)
        params_blob = flow.cast_to_current_logical_view(params_blob)
        updates_blob = flow.cast_to_current_logical_view(updates_blob)
        updates_var = flow.cast_to_current_logical_view(updates_var)
        params_var = params_var + params_blob
        updates_var = updates_var + updates_blob
        out = flow.tensor_scatter_nd_add(params_var, indices_blob, updates_var)
        flow.optimizer.SGD(
            flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
        ).minimize(out)

    flow.watch_diff(params_var, params_grad_watcher)
    flow.watch_diff(updates_var, updates_grad_watcher)
    return out
def slice_update_train_job(
    x: otp.Numpy.Placeholder(shape=input_shape, dtype=dtype),
    update: otp.Numpy.Placeholder(shape=update_shape, dtype=dtype),
) -> otp.Numpy:
    x_var = flow.get_variable(
        shape=input_shape,
        dtype=dtype,
        initializer=flow.constant_initializer(0.0),
        name="x",
    )
    update_var = flow.get_variable(
        shape=update_shape,
        dtype=dtype,
        initializer=flow.constant_initializer(0.0),
        name="update",
    )
    x = x + x_var
    update = update + update_var
    if callable(diff_watcher_maker):
        flow.watch_diff(x, diff_watcher_maker(input_shape))
        flow.watch_diff(update, diff_watcher_maker(update_shape))
    y = flow.slice_update(x, update, slice_tup_list)
    flow.optimizer.SGD(
        flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
    ).minimize(y)
    return y
def schedule_job(count_not_finite: oft.Numpy.Placeholder((1,), dtype=flow.int64)):
    with flow.scope.placement(device_type, "0:0"):
        good_step_counter = flow.get_variable(
            name="good_step_counter",
            shape=(1,),
            dtype=flow.int64,
            initializer=flow.constant_initializer(
                op_param["good_step_counter_value"], dtype=flow.int64
            ),
        )
        loss_scale = flow.get_variable(
            name="loss_scale",
            shape=(1,),
            dtype=flow.float,
            initializer=flow.constant_initializer(
                op_param["loss_scale_value"], dtype=flow.float
            ),
        )
        dynamic_loss_scale_schedule(
            count_not_finite,
            loss_scale,
            good_step_counter,
            op_param["increment_period"],
            op_param["multiplier"],
            "dynamic_schedule",
        )
        return (good_step_counter, loss_scale)
def do_where(condition, x, y):
    with flow.scope.placement(device_type, "0:0"):
        x_var = flow.get_variable(
            "x",
            shape=x.shape,
            dtype=flow.float,
            initializer=flow.constant_initializer(0),
        )
        x_var = flow.cast_to_current_logical_view(x_var)
        x_var = x_var + x
        y_var = flow.get_variable(
            "y",
            shape=y.shape,
            dtype=flow.float,
            initializer=flow.constant_initializer(0),
        )
        y_var = flow.cast_to_current_logical_view(y_var)
        y_var = y_var + y

    z = flow.where(condition, x_var, y_var)

    with flow.scope.placement(device_type, "0:0"):
        flow.optimizer.SGD(
            flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
        ).minimize(z)

    flow.watch_diff(x_var, dz_dx_watcher)
    flow.watch_diff(y_var, dz_dy_watcher)
    return z
def scatter_nd_update_grad_fn(
    x_def: oft.Numpy.Placeholder(params.shape, dtype=flow.float),
    indices_def: oft.Numpy.Placeholder(indices.shape, dtype=flow.int32),
    y_def: oft.Numpy.Placeholder(updates.shape, dtype=flow.float),
):
    with flow.scope.placement(device_type, "0:0"):
        x = flow.get_variable(
            "params",
            shape=params.shape,
            dtype=flow.float32,
            initializer=flow.constant_initializer(0),
        )
        y = flow.get_variable(
            "updates",
            shape=updates.shape,
            dtype=flow.float32,
            initializer=flow.constant_initializer(0),
        )
        x = x + x_def
        y = y + y_def
        z = flow.tensor_scatter_nd_update(x, indices_def, y)
        flow.optimizer.SGD(
            flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
        ).minimize(z)

    flow.watch_diff(x, compare_dz_dx)
    flow.watch_diff(y, compare_dz_dy)
    return z
def _model(dense_fields, wide_sparse_fields, deep_sparse_fields):
    wide_sparse_fields = flow.parallel_cast(
        wide_sparse_fields, distribute=flow.distribute.broadcast())
    wide_embedding_table = flow.get_variable(
        name='wide_embedding',
        shape=(FLAGS.wide_vocab_size, 1),
        initializer=flow.random_uniform_initializer(minval=-0.05, maxval=0.05),
        distribute=flow.distribute.split(0),
    )
    wide_embedding = flow.gather(params=wide_embedding_table, indices=wide_sparse_fields)
    wide_embedding = flow.reshape(
        wide_embedding,
        shape=(-1, wide_embedding.shape[-1] * wide_embedding.shape[-2]))
    wide_scores = flow.math.reduce_sum(wide_embedding, axis=[1], keepdims=True)
    wide_scores = flow.parallel_cast(
        wide_scores,
        distribute=flow.distribute.split(0),
        gradient_distribute=flow.distribute.broadcast())

    deep_sparse_fields = flow.parallel_cast(
        deep_sparse_fields, distribute=flow.distribute.broadcast())
    deep_embedding_table = flow.get_variable(
        name='deep_embedding',
        shape=(FLAGS.deep_vocab_size, FLAGS.deep_embedding_vec_size),
        initializer=flow.random_uniform_initializer(minval=-0.05, maxval=0.05),
        distribute=flow.distribute.split(1),
    )
    deep_embedding = flow.gather(params=deep_embedding_table, indices=deep_sparse_fields)
    deep_embedding = flow.parallel_cast(
        deep_embedding,
        distribute=flow.distribute.split(0),
        gradient_distribute=flow.distribute.split(2))
    deep_embedding = flow.reshape(
        deep_embedding,
        shape=(-1, deep_embedding.shape[-1] * deep_embedding.shape[-2]))
    deep_features = flow.concat([deep_embedding, dense_fields], axis=1)
    for idx, units in enumerate(DEEP_HIDDEN_UNITS):
        deep_features = flow.layers.dense(
            deep_features,
            units=units,
            kernel_initializer=flow.glorot_uniform_initializer(),
            bias_initializer=flow.constant_initializer(0.0),
            activation=flow.math.relu,
            name='fc' + str(idx + 1))
        deep_features = flow.nn.dropout(deep_features, rate=FLAGS.deep_dropout_rate)
    deep_scores = flow.layers.dense(
        deep_features,
        units=1,
        kernel_initializer=flow.glorot_uniform_initializer(),
        bias_initializer=flow.constant_initializer(0.0),
        name='fc' + str(len(DEEP_HIDDEN_UNITS) + 1))

    scores = wide_scores + deep_scores
    return scores
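A minimal sketch of how a `_model` like the ones above might be driven in OneFlow's legacy lazy mode. The `@flow.global_function` decorator and `oneflow.typing` placeholders match the pre-0.5 API these snippets already use; the batch size, field widths, and the stand-in objective are illustrative assumptions, not part of the original source.

import oneflow as flow
import oneflow.typing as tp

BATCH_SIZE = 512  # hypothetical; match your data pipeline

@flow.global_function(type="train")
def wdl_train_job(
    dense_fields: tp.Numpy.Placeholder((BATCH_SIZE, 13), dtype=flow.float32),
    wide_sparse_fields: tp.Numpy.Placeholder((BATCH_SIZE, 2), dtype=flow.int64),
    deep_sparse_fields: tp.Numpy.Placeholder((BATCH_SIZE, 26), dtype=flow.int64),
) -> tp.Numpy:
    scores = _model(dense_fields, wide_sparse_fields, deep_sparse_fields)
    # Stand-in objective so the sketch trains end to end; a real run would
    # apply a classification loss against click labels here instead.
    loss = flow.math.reduce_mean(scores)
    flow.optimizer.SGD(
        flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
    ).minimize(loss)
    return loss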
def _dense_layer(
    inputs,
    units,
    activation=None,
    use_bias=True,
    kernel_initializer=None,
    bias_initializer=None,
    trainable=True,
    name=None,
):
    in_shape = inputs.shape
    in_num_axes = len(in_shape)
    assert in_num_axes >= 2
    name_prefix = name if name is not None else id_util.UniqueStr("Dense_")
    inputs = flow.reshape(inputs, (-1, in_shape[-1])) if in_num_axes > 2 else inputs
    weight = flow.get_variable(
        name="{}-weight".format(name_prefix),
        shape=(units, inputs.shape[1]),
        dtype=inputs.dtype,
        initializer=kernel_initializer
        if kernel_initializer is not None
        else flow.constant_initializer(0),
        trainable=trainable,
        model_name="weight",
    )
    weight = flow.identity(weight)
    weight = flow.repeat(weight, args.num_piece_in_batch)
    out = flow.matmul(
        a=inputs, b=weight, transpose_b=True, name="{}_matmul".format(name_prefix)
    )
    if use_bias:
        bias = flow.get_variable(
            name="{}-bias".format(name_prefix),
            shape=(units,),
            dtype=inputs.dtype,
            initializer=bias_initializer
            if bias_initializer is not None
            else flow.constant_initializer(0),
            trainable=trainable,
            model_name="bias",
        )
        bias = flow.identity(bias)
        bias = flow.repeat(bias, args.num_piece_in_batch)
        out = flow.nn.bias_add(out, bias, name="{}_bias_add".format(name_prefix))
    out = (
        activation(out, name="{}_activation".format(name_prefix))
        if activation is not None
        else out
    )
    out = flow.reshape(out, in_shape[:-1] + (units,)) if in_num_axes > 2 else out
    return out
def deconv2d(
    input,
    filters,
    size,
    name,
    strides=2,
    trainable=True,
    reuse=False,
    const_init=False,
    use_bias=False,
):
    name_ = name if not reuse else name + "_reuse"
    weight_shape = (input.shape[1], filters, size, size)
    output_shape = (
        input.shape[0],
        filters,
        input.shape[2] * strides,
        input.shape[3] * strides,
    )
    weight = flow.get_variable(
        name + "-weight",
        shape=weight_shape,
        dtype=input.dtype,
        initializer=flow.random_normal_initializer(stddev=0.02)
        if not const_init
        else flow.constant_initializer(0.002),
        trainable=trainable,
        reuse=reuse,
    )
    output = flow.nn.conv2d_transpose(
        input,
        weight,
        strides=[strides, strides],
        output_shape=output_shape,
        padding="SAME",
        data_format="NCHW",
        name=name_,
    )
    if use_bias:
        bias = flow.get_variable(
            name + "-bias",
            shape=(filters,),
            dtype=input.dtype,
            initializer=flow.constant_initializer(0.0),
            trainable=trainable,
            reuse=reuse,
        )
        output = flow.nn.bias_add(output, bias, "NCHW")
    return output
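As a usage sketch only: a hypothetical DCGAN-style generator fragment composing `deconv2d` with the `dense` helper defined later in this section. Each strided call doubles the NCHW spatial size; the layer sizes and names are invented for illustration.

def generator(z, trainable=True, const_init=False):
    h = dense(z, 7 * 7 * 256, name="g_fc1", trainable=trainable, const_init=const_init)
    h = flow.reshape(h, (-1, 256, 7, 7))
    # strides=2 upsamples 7x7 -> 14x14 while projecting 256 channels to 128
    h = flow.math.relu(
        deconv2d(h, 128, 5, name="g_deconv1", trainable=trainable, const_init=const_init)
    )
    # 14x14 -> 28x28, down to a single output channel
    h = deconv2d(h, 1, 5, name="g_deconv2", trainable=trainable, const_init=const_init)
    return flow.math.tanh(h)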
def broadcast_to_compatible_with_fn(
    x_def: oft.Numpy.Placeholder(x.shape, dtype=flow.float)
):
    x_var = flow.get_variable(
        "x_var",
        shape=x.shape,
        dtype=flow.float,
        initializer=flow.constant_initializer(0),
        trainable=True,
    )
    compatible_var = [
        flow.get_variable(
            "compatible_var_{}".format(i),
            shape=cp_shape,
            dtype=flow.float,
            initializer=flow.random_normal_initializer(),
            trainable=False,
        )
        for i, cp_shape in enumerate(compatible_shape)
    ]
    x_var = x_var + x_def
    y = flow.broadcast_to_compatible_with(x_var, compatible_var)
    flow.optimizer.SGD(
        flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
    ).minimize(y)
    flow.watch_diff(x_var, dx_watcher)
    return y
def DistributeConcat():
    with flow.scope.placement("gpu", "0:0"):
        w = flow.get_variable("w", (2, 5), initializer=flow.constant_initializer(10))
        x = w + 1
        y = w + 1
        ret = flow.advanced.distribute_concat([x, y])
def oneflow_l1loss(
    of_input: tp.Numpy.Placeholder(shape=input.shape),
    of_target: tp.Numpy.Placeholder(shape=target.shape),
) -> Dict[str, tp.Numpy]:
    with flow.scope.placement(device_type, "0:0"):
        v = flow.get_variable(
            shape=target.shape,
            dtype=flow.float32,
            initializer=flow.constant_initializer(0),
            name="v",
        )
        x_var = of_input + v
        flow.watch_diff(x_var, assert_prediction_grad)

    l1loss = flow.nn.L1Loss(x_var, of_target, reduction="none", name="of_l1loss")
    l1loss_mean = flow.nn.L1Loss(
        x_var, of_target, reduction="mean", name="of_l1loss_mean"
    )
    l1loss_sum = flow.nn.L1Loss(
        x_var, of_target, reduction="sum", name="of_l1loss_sum"
    )

    with flow.scope.placement(device_type, "0:0"):
        flow.optimizer.SGD(
            flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
        ).minimize(l1loss_mean)

    return {
        "of_l1_loss": l1loss,
        "of_l1_loss_mean": l1loss_mean,
        "of_l1_loss_sum": l1loss_sum,
    }
def oneflow_range_gpu() -> List[tp.Numpy]:
    with flow.scope.placement(device_type, machine_ids):
        out_1 = flow.range(1, 10, 3, dtype=flow.float64, name="range_float64")  # [1, 4, 7]
        out_2 = flow.range(3, 6, 1, dtype=flow.float32, name="range_float32")   # [3, 4, 5]
        out_3 = flow.range(4, 13, 4, dtype=flow.float32, name="range_float16")  # [4, 8, 12]
        out_3 = flow.cast(out_3, dtype=flow.float32)
        out_4 = flow.range(3, dtype=flow.int32, name="range_int32")             # [0, 1, 2]
        out_5 = flow.range(0, 6, 2, dtype=flow.int64, name="range_int64")       # [0, 2, 4]

        x_var = flow.get_variable(
            "gpu_input",
            shape=(3,),
            dtype=flow.float32,
            initializer=flow.constant_initializer(0.0),
        )
        x_gpu_out = x_var + out_2
        flow.optimizer.SGD(
            flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
        ).minimize(x_gpu_out)
    return [out_1, out_2, out_3, out_4, out_5]
def pooling_job(x: tensor_def(x_shape, dtype=dtype)):
    v = flow.get_variable(
        "x",
        shape=x_shape,
        dtype=dtype,
        initializer=flow.constant_initializer(0),
        trainable=True,
    )
    v = flow.cast_to_current_logical_view(v)
    flow.watch_diff(v, assert_grad)
    x += v
    with flow.scope.placement(device_type, "0:0"):
        if pooling_type == "AVG":
            pooling_f = getattr(flow.nn, "avg_pool{}d".format(dim))
        elif pooling_type == "MAX":
            pooling_f = getattr(flow.nn, "max_pool{}d".format(dim))
        else:
            raise ValueError("pooling_type must be AVG or MAX")
        y = pooling_f(
            x,
            ksize=ksize,
            strides=strides,
            padding=padding,
            data_format=data_format,
        )
    flow.optimizer.SGD(
        flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
    ).minimize(y)
    return y
def _AddClassficationLoss(input_blob, label_blob, hidden_size, label_num,
                          initializer_range, scope_name='classification'):
    with flow.scope.namespace(scope_name):
        output_weight_blob = flow.get_variable(
            name="output_weights",
            shape=[label_num, hidden_size],
            dtype=input_blob.dtype,
            # initializer=bert_util.CreateInitializer(initializer_range),
            initializer=flow.random_normal_initializer(
                mean=0.0, stddev=initializer_range, seed=None, dtype=None),
        )
        output_bias_blob = flow.get_variable(
            name="output_bias",
            shape=[label_num],
            dtype=input_blob.dtype,
            initializer=flow.constant_initializer(0.0),
        )
        logit_blob = flow.matmul(input_blob, output_weight_blob, transpose_b=True)
        logit_blob = flow.nn.bias_add(logit_blob, output_bias_blob)
        pre_example_loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logit_blob, labels=label_blob)
        loss = pre_example_loss
        return loss, pre_example_loss, logit_blob
def row_parallel_linear(
    name,
    x,
    output_size,
    weight_initializer,
    bias_initializer=flow.constant_initializer(0.0),
    weight_parallel_dist=distribute.get_row_linear_weight_parallel_dist(),
    bias_parallel_dist=distribute.get_row_linear_bias_parallel_dist(),
    dropout_rate=0.1,
    bias_dropout_fusion=True,
):
    w, b = get_linear_params(
        name,
        x.shape[-1],
        output_size,
        x.dtype,
        weight_initializer=weight_initializer,
        bias_initializer=bias_initializer,
        weight_parallel_dist=weight_parallel_dist,
        bias_parallel_dist=bias_parallel_dist,
    )
    # 2d sbp sig: [S(0), S(1)] x [B, S(0)] -> [S(0), P] -> [S(0), B]
    # data grad 2d sbp sig: [S(0), B] x [B, S(1)](transposed) -> [S(0), S(1)]
    x = flow.matmul(x, w)
    x = distribute.forward_p2b_parallel_cast(x)
    if bias_dropout_fusion:
        x = flow.nn.fused_bias_add_dropout(x, b, data_format="NHC", rate=dropout_rate)
    else:
        x = flow.nn.bias_add(x, b, data_format="NHC")
        x = flow.nn.dropout(x, rate=dropout_rate)
    return x
def _AddNextSentenceOutput(input_blob, label_blob, hidden_size, initializer_range):
    with flow.scope.namespace("cls-seq_relationship"):
        output_weight_blob = flow.get_variable(
            name="output_weights",
            shape=[2, hidden_size],
            dtype=input_blob.dtype,
            model_name="weight",
            initializer=bert_util.CreateInitializer(initializer_range),
        )
        output_bias_blob = flow.get_variable(
            name="output_bias",
            shape=[2],
            dtype=input_blob.dtype,
            model_name="bias",
            initializer=flow.constant_initializer(0.0),
        )
        logit_blob = flow.matmul(input_blob, output_weight_blob, transpose_b=True)
        logit_blob = flow.nn.bias_add(logit_blob, output_bias_blob)
        pre_example_loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logit_blob, labels=label_blob)
        loss = pre_example_loss
        return (loss, pre_example_loss, logit_blob)
def get_linear_params(
    name,
    input_size,
    output_size,
    dtype,
    weight_initializer=flow.random_normal_initializer(stddev=0.02),
    bias_initializer=flow.constant_initializer(0.0),
    weight_parallel_dist=None,
    bias_parallel_dist=None,
):
    with flow.scope.namespace(name):
        weight = flow.get_variable(
            name="weight",
            shape=(input_size, output_size),
            dtype=dtype,
            initializer=weight_initializer,
            nd_sbp=weight_parallel_dist,
        )
        bias = flow.get_variable(
            name="bias",
            shape=(output_size,),
            dtype=dtype,
            initializer=bias_initializer,
            nd_sbp=bias_parallel_dist,
        )
    return weight, bias
def col_parallel_linear(
    name,
    x,
    output_size,
    weight_initializer,
    bias_initializer=flow.constant_initializer(0.0),
    weight_parallel_dist=distribute.get_col_linear_weight_parallel_dist(),
    bias_parallel_dist=distribute.get_col_linear_bias_parallel_dist(),
    need_gelu=False,
    bias_gelu_fusion=True,
):
    w, b = get_linear_params(
        name,
        x.shape[-1],
        output_size,
        x.dtype,
        weight_initializer=weight_initializer,
        bias_initializer=bias_initializer,
        weight_parallel_dist=weight_parallel_dist,
        bias_parallel_dist=bias_parallel_dist,
    )
    # 2d sbp sig: [S(0), B] x [B, S(1)] -> [S(0), S(1)]
    # data grad 2d sbp sig: [S(0), S(1)] x [B, S(0)](transposed) -> [S(0), P] -> [S(0), B]
    x = distribute.backward_p2b_parallel_cast(x)
    x = flow.matmul(x, w)
    if need_gelu:
        if bias_gelu_fusion:
            x = flow.nn.fused_bias_add_gelu(x, b, data_format="NHC")
        else:
            x = flow.nn.bias_add(x, b, data_format="NHC")
            x = flow.math.gelu(x)
    else:
        x = flow.nn.bias_add(x, b, data_format="NHC")
    return x
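A sketch of how the column- and row-parallel linears above are typically paired in a Megatron-style transformer MLP block: expand 4x with the column-parallel layer and fused GELU, then project back with the row-parallel layer, whose dropout closes the block. The block name, sizes, and initializer are hypothetical; only the two helper signatures come from this section.

def mlp_block(name, h, hidden_size=1024):
    with flow.scope.namespace(name):
        # column-parallel expansion: weight split on the output dim, S(1)
        h = col_parallel_linear(
            "c_fc",
            h,
            hidden_size * 4,
            weight_initializer=flow.random_normal_initializer(stddev=0.02),
            need_gelu=True,
        )
        # row-parallel projection: weight split on the input dim, S(0);
        # partial sums are reduced back to broadcast inside the helper
        h = row_parallel_linear(
            "c_proj",
            h,
            hidden_size,
            weight_initializer=flow.random_normal_initializer(stddev=0.02),
            dropout_rate=0.1,
        )
    return h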
def expandJob(
    of_input: tp.Numpy.Placeholder(shape=input.shape, dtype=flow.float32),
    multiplier: tp.Numpy.Placeholder(shape=gout.shape, dtype=flow.float32, batch_axis=diff),
) -> tp.Numpy:
    with flow.scope.placement(device_type, "0:0"):
        v = flow.get_variable(
            shape=of_input.shape,
            dtype=flow.float32,
            initializer=flow.constant_initializer(0),
            name="v",
        )
        input_x = v + of_input
        flow.watch_diff(input_x, assert_prediction_grad)

    x_fp16 = flow.cast(input_x, dtype=flow.float16)
    y_fp16 = flow.expand(x_fp16, expand_dim)
    y_fp32 = flow.cast(y_fp16, dtype=flow.float32)

    with flow.scope.placement(device_type, "0:0"):
        flow.optimizer.SGD(
            flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
        ).minimize(y_fp32 * multiplier)
    return y_fp32
def IdentityLoss(name):
    w = flow.get_variable(name, (10,), initializer=flow.constant_initializer(100))
    y = flow.math.reduce_sum(w)
    flow.optimizer.SGD(
        flow.optimizer.PiecewiseConstantScheduler([], [5]), momentum=0
    ).minimize(y)
    return y
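For context, a sketch of how tiny snippets like `IdentityLoss` are usually driven: wrap them in a legacy `@flow.global_function` train job and call it like a normal Python function. The explicit `CheckPoint.init()` is only needed on older releases, and the arithmetic in the comment assumes the plain SGD configured above.

import oneflow as flow
import oneflow.typing as tp

@flow.global_function(type="train")
def identity_loss_job() -> tp.Numpy:
    return IdentityLoss("w")

check_point = flow.train.CheckPoint()
check_point.init()  # materialize variables before the first call

# With w initialized to 100 and lr=5, d(sum w)/dw_i = 1, so every element
# drops by 5 per step and the returned loss goes 1000 -> 950 -> 900.
for step in range(3):
    loss = identity_loss_job()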
def fetch_job():
    with flow.scope.placement(device_type, "0:0"):
        good_step_counter = flow.get_variable(
            name="good_step_counter",
            shape=(1,),
            dtype=flow.int64,
            initializer=flow.constant_initializer(
                op_param["good_step_counter_value"], dtype=flow.int64
            ),
        )
        loss_scale = flow.get_variable(
            name="loss_scale",
            shape=(1,),
            dtype=flow.float,
            initializer=flow.constant_initializer(
                op_param["loss_scale_value"], dtype=flow.float
            ),
        )
    return (good_step_counter, loss_scale)
def relu_fn():
    var = flow.get_variable(
        name="var",
        shape=value.shape,
        dtype=dtype,
        initializer=flow.constant_initializer(0),
    )
    return flow.nn.relu(var)
def get_var():
    return flow.get_variable(
        name="var",
        shape=var_shape,
        dtype=flow_dtype,
        initializer=flow.constant_initializer(0, dtype=flow_dtype),
        distribute=flow.distribute.split(split_axis),
    )
def assign_fn(value_def: oft.Numpy.Placeholder(value.shape, dtype=dtype)):
    var = flow.get_variable(
        name="var",
        shape=value.shape,
        dtype=dtype,
        initializer=flow.constant_initializer(0),
    )
    assign(var, value_def)
def _prelu(inputs, name=None):
    return flow.layers.prelu(
        inputs,
        alpha_initializer=flow.constant_initializer(0.25),
        alpha_regularizer=_get_regularizer(),
        shared_axes=[2, 3],
        name=name,
    )
def dynamic_concat_job(
    input_0_def: oft.ListNumpy.Placeholder(shape=input_static_shape, dtype=flow.float),
    input_1_def: oft.ListNumpy.Placeholder(shape=input_static_shape, dtype=flow.float),
):
    var_0 = flow.get_variable(
        "Var0",
        shape=(1,),
        dtype=flow.float,
        initializer=flow.constant_initializer(value=1, dtype=flow.float),
        trainable=True,
    )
    var_1 = flow.get_variable(
        "Var1",
        shape=(1,),
        dtype=flow.float,
        initializer=flow.constant_initializer(value=1, dtype=flow.float),
        trainable=True,
    )
    var_0 = flow.cast_to_current_logical_view(var_0)
    var_1 = flow.cast_to_current_logical_view(var_1)
    input_0_def = flow.cast_to_current_logical_view(input_0_def)
    input_1_def = flow.cast_to_current_logical_view(input_1_def)
    if callable(watch_cb):
        flow.watch(var_0, watch_cb)
        flow.watch(var_1, watch_cb)
        flow.watch(flow.identity(input_0_def), watch_cb)
        flow.watch(flow.identity(input_1_def), watch_cb)
    var_0 = var_0 * input_0_def
    var_1 = var_1 * input_1_def
    if callable(watch_cb):
        flow.watch(var_0, watch_cb)
        flow.watch(var_1, watch_cb)
    result = flow.concat(
        [var_0, var_1], axis=axis, max_dim_size=input_static_shape[axis]
    )
    flow.optimizer.SGD(
        flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
    ).minimize(result)
    flow.watch_diff(var_0, make_watch_diff_cb(0))
    flow.watch_diff(var_1, make_watch_diff_cb(1))
    return result
def assign_fn(value_def: oft.Numpy.Placeholder(value.shape, dtype=dtype)):
    with flow.scope.placement(device_type, "1:0"):
        var = flow.get_variable(
            name="var",
            shape=value.shape,
            dtype=dtype,
            initializer=flow.constant_initializer(0),
        )
        assign(var, value_def)
def dense(
    input,
    units,
    name,
    use_bias=False,
    trainable=True,
    reuse=False,
    const_init=False,
):
    name_ = name if not reuse else name + "_reuse"
    in_shape = input.shape
    in_num_axes = len(in_shape)
    assert in_num_axes >= 2
    inputs = flow.reshape(input, (-1, in_shape[-1])) if in_num_axes > 2 else input
    weight = flow.get_variable(
        name="{}-weight".format(name),
        shape=(units, inputs.shape[1]),
        dtype=inputs.dtype,
        initializer=flow.random_normal_initializer(stddev=0.02)
        if not const_init
        else flow.constant_initializer(0.002),
        trainable=trainable,
        model_name="weight",
        reuse=reuse,
    )
    out = flow.matmul(a=inputs, b=weight, transpose_b=True, name=name_ + "matmul")
    if use_bias:
        bias = flow.get_variable(
            name="{}-bias".format(name),
            shape=(units,),
            dtype=inputs.dtype,
            initializer=flow.random_normal_initializer()
            if not const_init
            else flow.constant_initializer(0.002),
            trainable=trainable,
            model_name="bias",
            reuse=reuse,
        )
        out = flow.nn.bias_add(out, bias, name=name_ + "_bias_add")
    out = flow.reshape(out, in_shape[:-1] + (units,)) if in_num_axes > 2 else out
    return out
def relu_fn():
    with flow.scope.placement(device_type, "1:0"):
        var = flow.get_variable(
            name="var",
            shape=value.shape,
            dtype=dtype,
            initializer=flow.constant_initializer(0),
        )
        ret = flow.nn.relu(var)
        return ret