예제 #1
0
def main():
    """Run an all-reduce loop that requests KungFu resizes on a fixed schedule.

    Every iteration all-reduces one float32 tensor of ones per entry of
    ``model_grad_sizes[args.model]``.  When the current step is a key of the
    hard-coded schedule, a resize to the mapped size is requested.  The step
    counter is re-synchronised through ``sync_step`` on the first iteration
    and again after every resize that reports a change.

    The loop ends when a resize reports this worker as detached, or once the
    step counter exceeds ``args.steps``.
    """
    args = parse_args()
    grad_sizes = model_grad_sizes[args.model]

    ms.context.set_context(mode=ms.context.GRAPH_MODE,
                           device_target=args.device)

    # step -> size handed to KungFuResize when that step is reached
    # (presumably the target cluster size -- confirm against kfops).
    schedule = {
        3: 2,
        6: 3,
        9: 4,
        12: 1,
    }

    kfops.init(args.device)

    all_reduce = kfops.KungFuAllReduce()
    all_reduce_max = kfops.KungFuAllReduce(op=ReduceOp.MAX)
    resize = kfops.KungFuResize()

    xs = [
        ms.Tensor(np.array([1.0] * size).astype(np.float32))
        for size in grad_sizes
    ]

    step = 0
    need_sync = True
    while True:
        if need_sync:
            step = sync_step(step, all_reduce_max)
            print('step: %d' % (step))
            need_sync = False

        # The all-reduce is the workload itself; its outputs are not used.
        for x in xs:
            all_reduce(x)

        if step in schedule:
            target_size = schedule[step]
            new_size = ms.Tensor(target_size, dtype=ms.uint32)
            print('step=%d, will resize to %d' % (step, target_size))
            changed, detached = resize(new_size)
            print('changed %s, detached: %s' % (changed, detached))
            if changed:
                # Re-synchronise the step counter at the top of the next
                # iteration.
                need_sync = True
            if detached:
                break

        step += 1
        if step > args.steps:
            break
    print('train loop finished')
    kfops.finalize(args.device)
예제 #2
0
def main():
    """Benchmark all-reduce with either the MindSpore or the KungFu collective.

    One float32 tensor of ones is built per entry of
    ``model_grad_sizes[args.model]``.  All of them are all-reduced
    ``args.warmup_steps`` times (stage ``warmup``) and then ``args.steps``
    times (stage ``step``).  Rank 0 prints the elapsed time and a data rate
    for every iteration.

    Raises:
        RuntimeError: if ``args.collective`` is neither ``'mindspore'`` nor
            ``'kungfu'``.
    """
    args = parse_args()
    grad_sizes = model_grad_sizes[args.model]

    # Validate before touching any backend: previously an unknown value fell
    # into the KungFu branch, initialised KungFu, and only then raised --
    # without ever calling finalize.
    if args.collective not in ('mindspore', 'kungfu'):
        raise RuntimeError('invalid collective')
    use_kungfu = args.collective == 'kungfu'

    ms.context.set_context(mode=ms.context.GRAPH_MODE,
                           device_target=args.device)

    if use_kungfu:
        print('using kungfu collective')
        kfops.init(args.device)
        cluster_size = parse_kungfu_size()
        # Rank is derived from the worker's port as an offset from 10000.
        rank = parse_kungfu_port() - 10000
    else:
        init()
        cluster_size = get_group_size()
        rank = get_rank()

    print('rank: %d, size: %d' % (rank, cluster_size))

    if use_kungfu:
        all_reduce = kfops.KungFuAllReduce()
    else:
        all_reduce = ms.ops.operations.AllReduce()

    xs = [
        ms.Tensor(np.array([1.0] * size).astype(np.float32))
        for size in grad_sizes
    ]

    data_size = sum(grad_sizes) * 4  # 1 float is 4 bytes
    multiplier = 4 * (cluster_size - 1)
    Gi = 1024 * 1024 * 1024

    def run_stage(name, steps):
        """All-reduce every tensor ``steps`` times, reporting each round."""
        for i in range(steps):
            t0 = time.time()
            # Only the elapsed time matters; the outputs are not inspected.
            ys = [all_reduce(x) for x in xs]
            t1 = time.time()
            d = t1 - t0
            if rank == 0:
                # A coarse clock can report zero elapsed time; report an
                # infinite rate instead of raising ZeroDivisionError.
                if d > 0:
                    rate = float(data_size) * multiplier / Gi / d
                else:
                    rate = float('inf')
                print('%s %d took %.3fms, data rate: %.3fGiB/s' %
                      (name, i + 1, d * 1e3, rate))

    run_stage('warmup', args.warmup_steps)
    run_stage('step', args.steps)

    if use_kungfu:
        kfops.finalize(args.device)
예제 #3
0
def main():
    """Smoke-test KungFu all-reduce on a three-element float32 tensor.

    Parses and logs the command-line arguments, configures MindSpore graph
    mode on ``args.device``, then prints the input tensor followed by the
    result of one ``KungFuAllReduce`` call on it.
    """
    args = parse_args()
    log_args(args)
    ms.context.set_context(
        mode=ms.context.GRAPH_MODE,
        device_target=args.device,
        save_graphs=False,
    )

    kfops.init(args.device)

    reducer = kfops.KungFuAllReduce()

    values = np.array([1.0, 2.0, 3.0]).astype(np.float32)
    source = ms.Tensor(values)
    print(source)
    print(reducer(source))

    kfops.finalize(args.device)
예제 #4
0
        cb += [kungfu_elastic_callback]
        dataset_sink_mode = False
        print('enabled elastic')

        # from src.debug import DebugStopHook
        # cb += [DebugStopHook()]
    cb += [LogStepCallback()]

    # train model
    if args_opt.net == "se-resnet50":
        config.epoch_size = config.train_epoch_size
    print('training...')
    print('dataset.get_dataset_size(): %d' % (dataset.get_dataset_size()))
    print('%d callbacks' % (len(cb)))
    print('epoch_size: %d, pretrain_epoch_size: %d' %
          (config.epoch_size, config.pretrain_epoch_size))
    for c in cb:
        print('%s' % (c))
    train_epoch = config.epoch_size - config.pretrain_epoch_size
    print('dataset_sink_mode: %s' % (dataset_sink_mode))
    # sink_size = dataset.get_dataset_size()
    sink_size = 1
    model.train(train_epoch,
                dataset,
                callbacks=cb,
                sink_size=sink_size,
                dataset_sink_mode=dataset_sink_mode)
    print('train finished.')
    if args_opt.run_kungfu:
        kfops.finalize(args_opt.device_target)