def main():
    """Elastic all-reduce test loop.

    Repeatedly all-reduces tensors shaped like the selected model's
    gradients while resizing the cluster at scheduled steps, re-syncing
    the global step counter after every membership change.
    """
    args = parse_args()
    grad_sizes = model_grad_sizes[args.model]
    ms.context.set_context(mode=ms.context.GRAPH_MODE,
                           device_target=args.device)
    # step -> new cluster size: at these steps the cluster is resized.
    schedule = {
        3: 2,
        6: 3,
        9: 4,
        12: 1,
    }
    kfops.init(args.device)
    all_reduce = kfops.KungFuAllReduce()
    all_reduce_max = kfops.KungFuAllReduce(op=ReduceOp.MAX)
    resize = kfops.KungFuResize()
    xs = [
        ms.Tensor(np.array([1.0] * size).astype(np.float32))
        for size in grad_sizes
    ]
    step = 0
    need_sync = True
    while True:
        if need_sync:
            # After a membership change, workers agree on the global
            # step via a MAX all-reduce.
            step = sync_step(step, all_reduce_max)
            print('step: %d' % (step))
            need_sync = False
        # Run the collectives for their side effect; the reduced values
        # themselves are not inspected here.  (The original also timed
        # this loop into unused locals t0/t1/d — dead code, removed.)
        for x in xs:
            all_reduce(x)
        if step in schedule:
            new_size = ms.Tensor(schedule[step], dtype=ms.uint32)
            print('step=%d, will resize to %d' % (step, schedule[step]))
            changed, detached = resize(new_size)
            print('changed %s, detached: %s' % (changed, detached))
            if changed:
                # Cluster membership changed: re-sync step next iteration.
                need_sync = True
            if detached:
                # This worker is no longer part of the cluster.
                break
        step += 1
        if step > args.steps:
            break
    print('train loop finished')
    kfops.finalize(args.device)
def main():
    """All-reduce bandwidth benchmark.

    Runs warmup and timed stages of all-reduce over tensors shaped like
    the selected model's gradients, using either the MindSpore or the
    KungFu collective backend, and reports the data rate on rank 0.
    """
    args = parse_args()
    # Validate up front: the original raised 'invalid collective' only
    # AFTER initializing the KungFu backend as a side effect.
    if args.collective not in ('mindspore', 'kungfu'):
        raise RuntimeError('invalid collective')
    grad_sizes = model_grad_sizes[args.model]
    ms.context.set_context(mode=ms.context.GRAPH_MODE,
                           device_target=args.device)
    if args.collective == 'mindspore':
        init()
        cluster_size = get_group_size()
        rank = get_rank()
        all_reduce = ms.ops.operations.AllReduce()
    else:
        print('using kungfu collective')
        kfops.init(args.device)
        cluster_size = parse_kungfu_size()
        # presumably worker ports start at 10000, so port - 10000 is the
        # rank — confirm against the launcher.
        rank = parse_kungfu_port() - 10000
        all_reduce = kfops.KungFuAllReduce()
    print('rank: %d, size: %d' % (rank, cluster_size))
    xs = [
        ms.Tensor(np.array([1.0] * size).astype(np.float32))
        for size in grad_sizes
    ]
    data_size = sum(grad_sizes) * 4  # 1 float is 4 bytes
    # NOTE(review): looks like this models per-worker traffic of a ring
    # all-reduce (~4 * (n - 1) chunks) — confirm the intended formula.
    multiplier = 4 * (cluster_size - 1)
    Gi = 1024 * 1024 * 1024

    def run_stage(name, steps):
        # Time each step's all-reduce over all tensors; rank 0 prints
        # the per-step latency and achieved throughput.
        for i in range(steps):
            t0 = time.time()
            for x in xs:
                all_reduce(x)
            d = time.time() - t0
            rate = float(data_size) * multiplier / Gi / d
            if rank == 0:
                print('%s %d took %.3fms, data rate: %.3fGiB/s' %
                      (name, i + 1, d * 1e3, rate))

    run_stage('warmup', args.warmup_steps)
    run_stage('step', args.steps)
    if args.collective == 'kungfu':
        kfops.finalize(args.device)
def main():
    """Minimal KungFu smoke test: all-reduce one small tensor and print
    it before and after the collective."""
    args = parse_args()
    log_args(args)
    ms.context.set_context(mode=ms.context.GRAPH_MODE,
                           device_target=args.device,
                           save_graphs=False)
    kfops.init(args.device)
    values = np.array([1.0, 2.0, 3.0]).astype(np.float32)
    input_tensor = ms.Tensor(values)
    print(input_tensor)
    reduced = kfops.KungFuAllReduce()(input_tensor)
    print(reduced)
    kfops.finalize(args.device)
init() # GPU target else: init() context.set_auto_parallel_context( device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) if args_opt.net == "resnet50": context.set_auto_parallel_context( all_reduce_fusion_config=[85, 160]) ckpt_save_dir = config.save_checkpoint_path + "ckpt_" + str( get_rank()) + "/" if args_opt.run_kungfu: kfops.init(args_opt.device_target) rank = kfops.kungfu_current_rank() size = kfops.kungfu_current_cluster_size() print('kungfu rank=%d, size=%d' % (rank, size)) if args_opt.elastic: version = os.getenv('KUNGFU_INIT_CLUSTER_VERSION') ckpt_save_dir = config.save_checkpoint_path + "ckpt_" + str( rank) + '@' + version + "/" else: ckpt_save_dir = config.save_checkpoint_path + "ckpt_" + str( rank) + "/" # create dataset dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=True, repeat_num=100,