예제 #1
0
def main():
    """Run an all-reduce loop that resizes the KungFu cluster on a schedule.

    Reads the model name, device and step budget from ``parse_args()``,
    builds one float32 tensor of ones per entry of
    ``model_grad_sizes[args.model]`` and all-reduces them once per step.
    At the steps listed in ``schedule`` a resize is requested; the loop
    stops when this worker is detached or when ``args.steps`` is exceeded.

    KungFu is finalized even if the loop raises.
    """
    args = parse_args()
    grad_sizes = model_grad_sizes[args.model]

    ms.context.set_context(mode=ms.context.GRAPH_MODE,
                           device_target=args.device)

    # step number -> cluster size requested at that step
    schedule = {
        3: 2,
        6: 3,
        9: 4,
        12: 1,
    }

    kfops.init(args.device)
    try:
        all_reduce = kfops.KungFuAllReduce()
        all_reduce_max = kfops.KungFuAllReduce(op=ReduceOp.MAX)
        resize = kfops.KungFuResize()

        xs = [
            ms.Tensor(np.array([1.0] * size).astype(np.float32))
            for size in grad_sizes
        ]

        step = 0
        need_sync = True
        while True:
            if need_sync:
                # NOTE(review): sync_step is defined elsewhere; presumably it
                # agrees on a common step via the MAX all-reduce -- confirm.
                step = sync_step(step, all_reduce_max)
                print('step: %d' % (step))
                need_sync = False

            # The reduced tensors are not used; the calls are made only to
            # exercise the collective.
            for x in xs:
                all_reduce(x)

            if step in schedule:
                new_size = ms.Tensor(schedule[step], dtype=ms.uint32)
                print('step=%d, will resize to %d' % (step, schedule[step]))
                changed, detached = resize(new_size)
                print('changed %s, detached: %s' % (changed, detached))
                if changed:
                    # Re-synchronise the step counter on the next iteration.
                    need_sync = True
                if detached:
                    break

            step += 1
            if step > args.steps:
                break
        print('train loop finished')
    finally:
        # Always release KungFu, including when the loop raises.
        kfops.finalize(args.device)
예제 #2
0
def train(args):
    """All-reduce a small constant tensor inside a KungFu context.

    Prints the input tensor and then the all-reduced result.

    Args:
        args: Parsed options; only ``args.device`` is read.
    """
    with kfops.KungFuContext(device=args.device):
        reducer = kfops.KungFuAllReduce()
        values = np.array([1.0, 2.0, 3.0]).astype(np.float32)
        tensor = ms.Tensor(values)
        print(tensor)
        print(reducer(tensor))
예제 #3
0
    def __init__(self, *args, **kwargs):
        """Forward all arguments to the parent and create the KungFu ops.

        Besides the parent's state, the instance holds a composite ``Map``,
        a KungFu all-reduce op, and a tensor-logging op whose use is gated
        by ``dbg_log_tensor`` (off by default).
        """
        super().__init__(*args, **kwargs)

        self.map_ = ms.ops.composite.Map()
        self.all_reduce = kfops.KungFuAllReduce()

        # Debug logging of tensors is disabled unless this flag is flipped.
        self.dbg_log_tensor = False
        self.log_tensor = kfops.KungFuLogTensor()
예제 #4
0
def main():
    """Configure MindSpore graph mode and run one KungFu all-reduce.

    The device comes from ``parse_args()``. The input tensor and the
    all-reduced result are both printed.
    """
    options = parse_args()
    ms.context.set_context(
        mode=ms.context.GRAPH_MODE,
        device_target=options.device,
    )

    with kfops.KungFuContext(device=options.device):
        reduce_op = kfops.KungFuAllReduce()
        payload = np.array([1.0, 2.0, 3.0]).astype(np.float32)
        source = ms.Tensor(payload)
        print(source)
        reduced = reduce_op(source)
        print(reduced)
예제 #5
0
def main():
    """Benchmark all-reduce throughput with MindSpore or KungFu collectives.

    ``args.collective`` selects the backend ('mindspore' or 'kungfu').
    One float32 tensor of ones is built per entry of
    ``model_grad_sizes[args.model]``; every step all-reduces all of them
    and rank 0 prints the elapsed time and the derived data rate.

    Raises:
        RuntimeError: if ``args.collective`` is not a known backend. This is
            checked before any backend is initialised, so nothing is left
            half set up.
    """
    args = parse_args()
    grad_sizes = model_grad_sizes[args.model]

    ms.context.set_context(mode=ms.context.GRAPH_MODE,
                           device_target=args.device)

    # Validate up front: previously an unknown value fell into the KungFu
    # branch and initialised it before being rejected.
    if args.collective not in ('mindspore', 'kungfu'):
        raise RuntimeError('invalid collective')
    use_kungfu = args.collective == 'kungfu'

    if use_kungfu:
        print('using kungfu collective')
        kfops.init(args.device)
    else:
        init()

    try:
        if use_kungfu:
            cluster_size = parse_kungfu_size()
            # NOTE(review): rank is derived from the port; presumably worker
            # ports start at 10000 -- confirm against the launcher.
            rank = parse_kungfu_port() - 10000
        else:
            cluster_size = get_group_size()
            rank = get_rank()

        print('rank: %d, size: %d' % (rank, cluster_size))

        if use_kungfu:
            all_reduce = kfops.KungFuAllReduce()
        else:
            all_reduce = ms.ops.operations.AllReduce()

        xs = [
            ms.Tensor(np.array([1.0] * size).astype(np.float32))
            for size in grad_sizes
        ]

        data_size = sum(grad_sizes) * 4  # 1 float is 4 bytes
        multiplier = 4 * (cluster_size - 1)
        Gi = 1024 * 1024 * 1024

        def run_stage(name, steps):
            """Run ``steps`` all-reduce rounds, reporting each from rank 0."""
            for i in range(steps):
                # perf_counter is monotonic and high resolution, unlike the
                # wall clock, so it is the right timer for short intervals.
                t0 = time.perf_counter()
                for x in xs:
                    all_reduce(x)
                d = time.perf_counter() - t0
                # Guard against a zero interval instead of dividing by zero.
                if d > 0:
                    rate = float(data_size) * multiplier / Gi / d
                else:
                    rate = float('inf')
                if rank == 0:
                    print('%s %d took %.3fms, data rate: %.3fGiB/s' %
                          (name, i + 1, d * 1e3, rate))

        run_stage('warmup', args.warmup_steps)
        run_stage('step', args.steps)
    finally:
        # Only the KungFu backend is explicitly finalized here.
        if use_kungfu:
            kfops.finalize(args.device)
예제 #6
0
def main():
    """Initialise KungFu, all-reduce one small tensor, and print the result.

    Arguments come from ``parse_args()`` and are logged via ``log_args``.
    MindSpore runs in graph mode on ``args.device`` without saving graphs.
    KungFu is finalized even if the all-reduce raises.
    """
    args = parse_args()
    log_args(args)
    ms.context.set_context(mode=ms.context.GRAPH_MODE,
                           device_target=args.device,
                           save_graphs=False)

    kfops.init(args.device)
    try:
        all_reduce = kfops.KungFuAllReduce()

        x = ms.Tensor(np.array([1.0, 2.0, 3.0]).astype(np.float32))
        print(x)
        y = all_reduce(x)
        print(y)
    finally:
        # Pair every init with a finalize, including on failure.
        kfops.finalize(args.device)
예제 #7
0
    def __init__(self,
                 num_features,
                 eps=1e-5,
                 momentum=0.9,
                 affine=True,
                 gamma_init="ones",
                 beta_init="zeros",
                 moving_mean_init="zeros",
                 moving_var_init="ones",
                 input_dims="2d",
                 data_format="NCHW"):
        """Validate the configuration and create parameters and operators.

        Args:
            num_features: Integer >= 1; length of every parameter vector.
            eps: Stored unchanged as ``self.eps``.
            momentum: Number in [0, 1]; stored as ``1.0 - momentum``.
            affine: Whether ``gamma`` and ``beta`` require gradients.
            gamma_init: Initializer spec for ``gamma``.
            beta_init: Initializer spec for ``beta``.
            moving_mean_init: Initializer spec for the moving mean.
            moving_var_init: Initializer spec for the moving variance.
            input_dims: Stored unchanged as ``self.input_dims``.
            data_format: 'NCHW' or 'NHWC'; 'NHWC' is accepted only when the
                device target is GPU.

        Raises:
            ValueError: if ``num_features`` < 1, ``momentum`` is outside
                [0, 1], or 'NHWC' is requested on a non-GPU target.
        """
        super().__init__()

        validator.check_value_type('num_features', num_features, [int], self.cls_name)
        if num_features < 1:
            raise ValueError("num_features must be at least 1")
        self.num_features = num_features

        if momentum < 0 or momentum > 1:
            raise ValueError(
                "momentum should be a number in range [0, 1], but got {}".format(momentum))
        # The complement of the user-supplied momentum is what gets stored.
        self.momentum = 1.0 - momentum
        self.input_dims = input_dims

        self.format = validator.check_string(data_format, ['NCHW', 'NHWC'], 'format', self.cls_name)
        on_gpu = ms.context.get_context("device_target") == "GPU"
        if self.format == "NHWC" and not on_gpu:
            raise ValueError("NHWC format only support in GPU target.")
        self.eps = eps

        def _make_param(init, name, requires_grad):
            """Build a ``num_features``-sized parameter from an initializer."""
            return ms.Parameter(initializer(init, num_features),
                                name=name, requires_grad=requires_grad)

        # Running statistics never receive gradients; gamma/beta follow
        # ``affine``. Creation order is kept as-is.
        self.moving_mean = _make_param(moving_mean_init, "mean", False)
        self.moving_variance = _make_param(moving_var_init, "variance", False)
        self.gamma = _make_param(gamma_init, "gamma", affine)
        self.beta = _make_param(beta_init, "beta", affine)

        self._all_reduce_op = kfops.KungFuAllReduce()
        self._square_op = ms.ops.Square()
        self._sqrt_op = ms.ops.Sqrt()

        # HACK: the input-taking cluster-size op is used in place of the
        # plain kfops.KungFuClusterSize(); the one-element int32 tensor
        # below is kept alongside it as its input.
        self._cluster_size_op = kfops.KungFuClusterSizeInput()
        self._cluster_size_input = ms.Tensor(np.ones((1,), dtype=np.int32))