Example #1
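This job function chains three parallel_cast conversions around a matmul: the input a is re-split from S(0) to S(1) to line up with a model-parallel variable, the matmul's partial-sum (P) result is reduced to broadcast (B), and the bias c is cast from S(0) to B before bias_add.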
def test_fn(
    a: flow.typing.Numpy.Placeholder(a_shape),
    b: flow.typing.Numpy.Placeholder(b_shape),
    c: flow.typing.Numpy.Placeholder(c_shape),
) -> flow.typing.Numpy:
    # print(f"a.split_axis: {a.split_axis}")
    # print(f"b.split_axis: {b.split_axis}")
    # print(f"c.split_axis: {c.split_axis}")
    var_a = flow.get_variable(
        name="var_a",
        shape=a_shape,
        dtype=flow.float32,
        initializer=flow.ones_initializer(),
        distribute=flow.distribute.split(1),
    )
    # S0 -> S1
    a = flow.parallel_cast(a, distribute=flow.distribute.split(1))
    a = var_a * a
    out = flow.matmul(a, b)
    # P -> B
    out = flow.parallel_cast(
        out,
        distribute=flow.distribute.broadcast(),
        gradient_distribute=flow.distribute.broadcast(),
    )
    # S0 -> B
    c = flow.parallel_cast(c, distribute=flow.distribute.broadcast())
    out = flow.nn.bias_add(out, c)
    lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [0.001])
    flow.optimizer.SGD(lr_scheduler, momentum=0).minimize(out)
    return out
Example #2
def _model(dense_fields, wide_sparse_fields, deep_sparse_fields):
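    # Broadcast the sparse IDs so every device can gather against its own
    # row-split (S(0)) shard of the wide embedding table.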
    wide_sparse_fields = flow.parallel_cast(
        wide_sparse_fields, distribute=flow.distribute.broadcast())
    wide_embedding_table = flow.get_variable(
        name='wide_embedding',
        shape=(FLAGS.wide_vocab_size, 1),
        initializer=flow.random_uniform_initializer(minval=-0.05, maxval=0.05),
        distribute=flow.distribute.split(0),
    )
    wide_embedding = flow.gather(params=wide_embedding_table,
                                 indices=wide_sparse_fields)
    wide_embedding = flow.reshape(wide_embedding,
                                  shape=(-1, wide_embedding.shape[-1] *
                                         wide_embedding.shape[-2]))
    wide_scores = flow.math.reduce_sum(wide_embedding, axis=[1], keepdims=True)
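    # Hand the wide scores downstream as batch-split S(0); the backward
    # gradient is distributed as broadcast.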
    wide_scores = flow.parallel_cast(
        wide_scores,
        distribute=flow.distribute.split(0),
        gradient_distribute=flow.distribute.broadcast())

    deep_sparse_fields = flow.parallel_cast(
        deep_sparse_fields, distribute=flow.distribute.broadcast())
    deep_embedding_table = flow.get_variable(
        name='deep_embedding',
        shape=(FLAGS.deep_vocab_size, FLAGS.deep_embedding_vec_size),
        initializer=flow.random_uniform_initializer(minval=-0.05, maxval=0.05),
        distribute=flow.distribute.split(1),
    )
    deep_embedding = flow.gather(params=deep_embedding_table,
                                 indices=deep_sparse_fields)
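    # Gathering from a column-split (S(1)) table leaves the embeddings split
    # along the vector axis; cast them to batch-split S(0) for the dense
    # layers, re-splitting the gradient along axis 2 on the way back.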
    deep_embedding = flow.parallel_cast(
        deep_embedding,
        distribute=flow.distribute.split(0),
        gradient_distribute=flow.distribute.split(2))
    deep_embedding = flow.reshape(deep_embedding,
                                  shape=(-1, deep_embedding.shape[-1] *
                                         deep_embedding.shape[-2]))
    deep_features = flow.concat([deep_embedding, dense_fields], axis=1)
    for idx, units in enumerate(DEEP_HIDDEN_UNITS):
        deep_features = flow.layers.dense(
            deep_features,
            units=units,
            kernel_initializer=flow.glorot_uniform_initializer(),
            bias_initializer=flow.constant_initializer(0.0),
            activation=flow.math.relu,
            name='fc' + str(idx + 1))
        deep_features = flow.nn.dropout(deep_features,
                                        rate=FLAGS.deep_dropout_rate)
    deep_scores = flow.layers.dense(
        deep_features,
        units=1,
        kernel_initializer=flow.glorot_uniform_initializer(),
        bias_initializer=flow.constant_initializer(0.0),
        name='fc' + str(len(DEEP_HIDDEN_UNITS) + 1))

    scores = wide_scores + deep_scores
    return scores
Example #3
    def gpt2_func(x: flow.typing.Numpy.Placeholder(
        (args.batch_size, args.seq_len), dtype=flow.int64)):
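        # If the token ids arrive batch-split (S(0)), broadcast them so
        # every device sees the full batch.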
        if x.split_axis == 0:
            x = flow.parallel_cast(x, distribute=flow.distribute.broadcast())

        gpt2 = GPT2(args, name="model")
        outputs = gpt2.forward(x)
        loss = gpt2.loss(x,
                         outputs["logits"],
                         parallel_loss=args.parallel_loss)
        outputs["loss"] = loss
        optimizer = util.make_optimizer(args)
        optimizer.minimize(loss)
        return {"loss": loss}
Example #4
def with_gradient_distribute(self, distribute):
    return oneflow.parallel_cast(self, gradient_distribute=distribute)
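This helper is a thin wrapper over oneflow.parallel_cast that overrides only the backward (gradient) distribution while leaving the forward value untouched. A minimal usage sketch, assuming a blob y (the name and the broadcast choice are illustrative, not from the original):

    # forward value is unchanged; the gradient is reduced to broadcast
    y = y.with_gradient_distribute(flow.distribute.broadcast())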
Example #5
def insightface_train_job():
    if args.use_synthetic_data:
        (labels, images) = ofrecord_util.load_synthetic(args)
    else:
        labels, images = ofrecord_util.load_train_dataset(args)
    print("train batch data: ", images.shape)
    embedding = insightface(images)

    def _get_initializer():
        return flow.random_normal_initializer(mean=0.0, stddev=0.01)

    trainable = True
    if args.loss_type == "arc_loss":
        s = args.margin_s
        m = args.margin
        fc1 = flow.math.l2_normalize(input=embedding, axis=1, epsilon=1e-10)
        fc1 = flow.math.multiply(fc1, s)
        fc7 = flow.get_variable(
            name="fc7-weight",
            shape=(args.class_num, fc1.shape[1]),
            dtype=fc1.dtype,
            initializer=_get_initializer(),
            trainable=trainable,
            model_name="weight",
        )
        fc7 = flow.math.l2_normalize(input=fc7, axis=1, epsilon=1e-10)
        matmul = flow.matmul(a=fc1, b=fc7, transpose_b=True)
        labels_expand = flow.reshape(labels, (labels.shape[0], 1))
        zy = flow.gather(matmul, labels_expand, batch_dims=1)
        cos_t = flow.math.multiply(zy, 1 / s)
        cos_m = math.cos(m)
        sin_m = math.sin(m)
        mm = math.sin(math.pi - m) * m
        threshold = math.cos(math.pi - m)
        if args.easy_margin:
            cond = flow.math.relu(cos_t)
        else:
            cond_v = cos_t - threshold
            cond = flow.math.relu(cond_v)
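        # sin(t) = sqrt(1 - cos(t)^2)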
        body = flow.math.square(cos_t)
        body = flow.math.multiply(body, -1.0)
        body = flow.math.add(1, body)
        sin_t = flow.math.sqrt(body)

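        # cos(t + m) = cos(t)*cos(m) - sin(t)*sin(m), then rescale by s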
        new_zy = flow.math.multiply(cos_t, cos_m)
        b = flow.math.multiply(sin_t, sin_m)
        b = flow.math.multiply(b, -1.0)
        new_zy = flow.math.add(new_zy, b)
        new_zy = flow.math.multiply(new_zy, s)
        if args.easy_margin:
            zy_keep = zy
        else:
            zy_keep = flow.math.add(zy, -s * mm)
        cond = flow.cast(cond, dtype=flow.int32)
        new_zy = flow.where(cond, new_zy, zy_keep)
        zy = flow.math.multiply(zy, -1.0)
        diff = flow.math.add(new_zy, zy)

        gt_one_hot = flow.one_hot(
            labels, depth=args.class_num, dtype=flow.float
        )
        body = flow.math.multiply(gt_one_hot, diff)
        fc7 = flow.math.add(matmul, body)
    elif args.loss_type == "margin_softmax":
        fc7_weight = flow.get_variable(
            name="fc7-weight",
            shape=(args.class_num, embedding.shape[1]),
            dtype=embedding.dtype,
            initializer=_get_initializer(),
            trainable=trainable,
            model_name="weight",
        )
        s = args.margin_s
        fc7_weight = flow.math.l2_normalize(
            input=fc7_weight, axis=1, epsilon=1e-10
        )
        fc1 = (
            flow.math.l2_normalize(input=embedding, axis=1, epsilon=1e-10) * s
        )
        fc7 = flow.matmul(a=fc1, b=fc7_weight, transpose_b=True)
        if args.loss_m1 != 1.0 or args.loss_m2 != 0.0 or args.loss_m3 != 0.0:
            if args.loss_m1 == 1.0 and args.loss_m2 == 0.0:
                s_m = s * args.loss_m3
                gt_one_hot = flow.one_hot(
                    labels,
                    depth=args.class_num,
                    on_value=s_m,
                    off_value=0.0,
                    dtype=flow.float,
                )
                fc7 = fc7 - gt_one_hot
            else:
                labels_expand = flow.reshape(labels, (labels.shape[0], 1))
                zy = flow.gather(fc7, labels_expand, batch_dims=1)
                cos_t = zy * (1 / s)
                t = flow.math.acos(cos_t)
                if args.loss_m1 != 1.0:
                    t = t * args.loss_m1
                if args.loss_m2 > 0.0:
                    t = t + args.loss_m2
                body = flow.math.cos(t)
                if args.loss_m3 > 0.0:
                    body = body - args.loss_m3
                new_zy = body * s
                diff = new_zy - zy
                gt_one_hot = flow.one_hot(
                    labels,
                    depth=args.class_num,
                    on_value=1.0,
                    off_value=0.0,
                    dtype=flow.float,
                )
                body = gt_one_hot * diff
                fc7 = fc7 + body
    elif args.loss_type == "softmax":
        if args.model_parallel:
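            # Model parallelism: broadcast features and labels, row-split
            # (S(0)) the class weight, and take the logits back split along
            # the class axis (S(1)).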
            labels = labels.with_distribute(flow.distribute.broadcast())
            fc1_distribute = flow.distribute.broadcast()
            fc7_data_distribute = flow.distribute.split(1)
            fc7_model_distribute = flow.distribute.split(0)
        else:
            fc1_distribute = flow.distribute.split(0)
            fc7_data_distribute = flow.distribute.split(0)
            fc7_model_distribute = flow.distribute.broadcast()
        print("loss 0")
        fc7 = flow.layers.dense(
            inputs=embedding.with_distribute(fc1_distribute),
            units=args.class_num,
            activation=None,
            use_bias=False,
            kernel_initializer=_get_initializer(),
            bias_initializer=None,
            trainable=trainable,
            name=args.models_name,
            model_distribute=fc7_model_distribute,
        )
        fc7 = fc7.with_distribute(fc7_data_distribute)
    elif args.loss_type == "arc_loss_ms":
        labels = labels.with_distribute(flow.distribute.broadcast())
        fc7_model_distribute = flow.distribute.split(0)
        fc7_data_distribute = flow.distribute.split(1)
        fc7_weight = flow.get_variable(
            name="fc7-weight",
            shape=(args.class_num, embedding.shape[1]),
            dtype=embedding.dtype,
            initializer=_get_initializer(),
            trainable=trainable,
            model_name="weight",
            distribute=fc7_model_distribute,
        )
        s = args.margin_s
        fc7_weight = flow.math.l2_normalize(
            input=fc7_weight, axis=1, epsilon=1e-10
        )
        fc1 = (
            flow.math.l2_normalize(input=embedding, axis=1, epsilon=1e-10)
        )
        fc1 = flow.parallel_cast(fc1, distribute=flow.distribute.broadcast())
        fc7 = flow.matmul(a=fc1, b=fc7_weight, transpose_b=True)  # logits are S(1)
        fc7 = flow.arc_loss(fc7, labels, margin=args.loss_m2) * 60
        fc7 = fc7.with_distribute(fc7_data_distribute)
    else:
        raise NotImplementedError

    loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
        labels, fc7, name="softmax_loss"
    )
    
    lr_scheduler = flow.optimizer.PiecewiseScalingScheduler(
        args.base_lr, [100000, 140000, 160000], [0.1, 0.01, 0.001]
    )
    flow.optimizer.SGD(lr_scheduler, momentum=0.9).minimize(loss)
    return loss