def elastic_example(args):
    """Walk through the elastic MNIST training set inside a KungFu context.

    The dataset is built from ``<args.data_path>/mnist/train`` with
    ``args.batch_size``. The total number of steps is printed to stdout, then
    at most ``min(args.max_step, total)`` items are pulled from the dataset and
    the index, dtype and shape of each one are reported on stderr.
    """
    train_dir = os.path.join(args.data_path, 'mnist', 'train')
    dataset = create_elastic_mnist(data_path=train_dir, batch_size=args.batch_size)

    total = dataset.get_dataset_size()
    summary = (total, args.batch_size)
    print('total steps: %d when using batch size: %d' % summary)

    with kfops.KungFuContext(device=args.device):
        # NOTE: an earlier, still-disabled draft synchronised a
        # State(0, 60000) global offset at this point before consuming data.
        batches = enumerate(dataset)
        step_limit = min(args.max_step, total)
        for _ in range(step_limit):
            idx, batch = next(batches)
            x, y = batch
            report = (idx, total, x.dtype, x.shape, y.dtype, y.shape)
            print('data consumed: %d/%d %s%s %s%s' % report, file=sys.stderr)
def train(args):
    """Apply ``KungFuAllReduce`` to a fixed float32 tensor and print both sides.

    Everything runs inside ``kfops.KungFuContext(device=args.device)``. The
    input tensor ``[1.0, 2.0, 3.0]`` is printed first, followed by whatever the
    all-reduce operator returns for it.
    """
    with kfops.KungFuContext(device=args.device):
        reducer = kfops.KungFuAllReduce()

        values = np.array([1.0, 2.0, 3.0]).astype(np.float32)
        local = ms.Tensor(values)
        print(local)

        reduced = reducer(local)
        print(reduced)
def main():
    """Parse and log the arguments, then pass ``run`` to ``log_duration``.

    When ``args.use_kungfu`` is truthy the call happens inside
    ``kfops.KungFuContext(device=args.device)``; otherwise it is made directly.
    """
    args = parse_args()
    log_args(args)

    # Plain path: no KungFu context is needed.
    if not args.use_kungfu:
        log_duration(run, args)
        return

    with kfops.KungFuContext(device=args.device):
        log_duration(run, args)
def main():
    """Set up MindSpore in graph mode and call ``kfops.kungfu_debug_nccl()``.

    The arguments are parsed and logged first. MindSpore targets
    ``args.device`` with graph saving turned off, while the KungFu context is
    always opened with ``device='CPU'``.
    """
    args = parse_args()
    log_args(args)

    context_settings = {
        'mode': ms.context.GRAPH_MODE,
        'device_target': args.device,
        'save_graphs': False,
    }
    ms.context.set_context(**context_settings)

    # 'CPU' is deliberate: per the original note, don't init KungFu NCCL here.
    kungfu_ctx = kfops.KungFuContext(device='CPU')
    with kungfu_ctx:
        kfops.kungfu_debug_nccl()
def main():
    """Configure MindSpore graph mode and demo ``KungFuAllReduce`` on a tensor.

    MindSpore is pointed at ``args.device``. Inside a KungFu context on the
    same device, the float32 tensor ``[1.0, 2.0, 3.0]`` is printed, passed
    through the all-reduce operator, and the result is printed as well.
    """
    args = parse_args()

    context_settings = {
        'mode': ms.context.GRAPH_MODE,
        'device_target': args.device,
    }
    ms.context.set_context(**context_settings)

    with kfops.KungFuContext(device=args.device):
        reducer = kfops.KungFuAllReduce()

        values = np.array([1.0, 2.0, 3.0]).astype(np.float32)
        local = ms.Tensor(values)
        print(local)

        reduced = reducer(local)
        print(reduced)
def main():
    """Parse the arguments and call ``run(args)`` inside a KungFu context.

    The context is created for ``args.device``.
    """
    args = parse_args()
    kungfu_ctx = kfops.KungFuContext(device=args.device)
    with kungfu_ctx:
        run(args)
download_dataset(args.data_dir) # define the loss function net_loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') # create the network network = LeNet5() # define the optimizer net_opt = build_optimizer(args, network) config_ck = CheckpointConfig(save_checkpoint_steps=1875, keep_checkpoint_max=10) # save the network model and parameters for subsequence fine-tuning ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", config=config_ck) # group layers into an object with training and evaluation features model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()}) train_net(model, args.epoch_size, args.data_dir, args.repeat_size, ckpoint_cb, dataset_sink_mode) # TODO: test # test_net(network, model, args.data_dir) if __name__ == "__main__": args = parse_args() if args.use_kungfu: with kfops.KungFuContext(device=args.device): main(args) else: main(args)