def train_resnet(DIST=True, graph=True, sequential=False, verbosity=0):
    """Benchmark ResNet-50 training throughput on one fixed random batch.

    Runs 100 training iterations with batch size 32 on 224x224 inputs and,
    on global rank 0, prints throughput and timing figures followed by the
    device time profile.

    Args:
        DIST: when equal to ``True`` the SGD optimizer is wrapped in
            ``opt.DistOpt`` and ranks / world size are read from it;
            otherwise a single process on device 0 is assumed.
        graph: forwarded to ``model.compile`` as ``use_graph``.
        sequential: NOTE: the passed value is always overwritten below
            (``True`` for the distributed path, ``False`` otherwise).
        verbosity: forwarded to ``dev.SetVerbosity``.
    """
    # Fixed benchmark settings.
    IMG_SIZE = 224
    batch_size = 32
    niters = 100
    sgd = opt.SGD(lr=0.1, momentum=0.9, weight_decay=1e-5)

    # The in-code note from the original author: for distributed training,
    # sequential has better throughput in the current version.
    if DIST == True:
        sgd = opt.DistOpt(sgd)
        world_size, local_rank, global_rank = (sgd.world_size,
                                               sgd.local_rank,
                                               sgd.global_rank)
        sequential = True
    else:
        local_rank, world_size, global_rank = 0, 1, 0
        sequential = False

    dev = device.create_cuda_gpu_on(local_rank)

    # One synthetic batch is generated once and reused for every iteration.
    input_shape = (batch_size, 3, IMG_SIZE, IMG_SIZE)
    tx = tensor.Tensor(input_shape, dev)
    ty = tensor.Tensor((batch_size,), dev, tensor.int32)
    images = np.random.randn(*input_shape).astype(np.float32)
    labels = np.random.randint(0, 1000, batch_size, dtype=np.int32)
    tx.copy_from_numpy(images)
    ty.copy_from_numpy(labels)

    dev.SetVerbosity(verbosity)
    dev.SetSkipIteration(5)

    # Build and compile the model.
    from model import resnet
    model = resnet.resnet50(num_channels=3, num_classes=1000)
    model.train()
    model.set_optimizer(sgd)
    model.compile([tx],
                  is_train=True,
                  use_graph=graph,
                  sequential=sequential)

    # Timed training loop, bracketed by device syncs.
    dev.Sync()
    start = time.time()
    with trange(niters) as progress:
        for _ in progress:
            model(tx, ty, dist_option='fp32', spars=None)
    dev.Sync()
    end = time.time()

    elapsed = end - start
    titer = elapsed / float(niters)
    throughput = float(niters * batch_size * world_size) / elapsed

    # Only global rank 0 reports.
    if global_rank != 0:
        return
    print("Throughput = {} per second".format(throughput), flush=True)
    print("TotalTime={}".format(elapsed), flush=True)
    print("Total={}".format(titer), flush=True)
    dev.PrintTimeProfiling()
def run(args, local_rank, world_size, nccl_id):
    """Build a distributed SGD optimizer and hand control to ``train.run``.

    Args:
        args: object providing ``lr``, ``max_epoch``, ``batch_size``,
            ``model``, ``data``, ``graph``, ``dist_option`` and ``spars``.
        local_rank: forwarded to ``opt.DistOpt``.
        world_size: forwarded to ``opt.DistOpt``.
        nccl_id: forwarded to ``opt.DistOpt``.
    """
    base_sgd = opt.SGD(lr=args.lr, momentum=0.9, weight_decay=1e-5)
    dist_sgd = opt.DistOpt(base_sgd,
                           nccl_id=nccl_id,
                           local_rank=local_rank,
                           world_size=world_size)

    # Ranks and world size are taken from the DistOpt wrapper, not from the
    # raw arguments of this function.
    train.run(
        dist_sgd.global_rank,
        dist_sgd.world_size,
        dist_sgd.local_rank,
        args.max_epoch,
        args.batch_size,
        args.model,
        args.data,
        dist_sgd,
        args.graph,
        args.dist_option,
        args.spars,
    )
def run(args, local_rank, world_size, nccl_id):
    """Build a precision-aware distributed SGD and call ``train_cnn.run``.

    Args:
        args: object providing ``lr``, ``precision``, ``max_epoch``,
            ``batch_size``, ``model``, ``data``, ``graph``, ``verbosity``,
            ``dist_option`` and ``spars``.
        local_rank: forwarded to ``opt.DistOpt``.
        world_size: forwarded to ``opt.DistOpt``.
        nccl_id: forwarded to ``opt.DistOpt``.
    """
    # The optimizer dtype is looked up from the requested precision.
    base_sgd = opt.SGD(lr=args.lr,
                       momentum=0.9,
                       weight_decay=1e-5,
                       dtype=singa_dtype[args.precision])
    dist_sgd = opt.DistOpt(base_sgd,
                           nccl_id=nccl_id,
                           local_rank=local_rank,
                           world_size=world_size)

    # Ranks and world size are taken from the DistOpt wrapper, not from the
    # raw arguments of this function.
    train_cnn.run(
        dist_sgd.global_rank,
        dist_sgd.world_size,
        dist_sgd.local_rank,
        args.max_epoch,
        args.batch_size,
        args.model,
        args.data,
        dist_sgd,
        args.graph,
        args.verbosity,
        args.dist_option,
        args.spars,
        args.precision,
    )
# # the code is modified from # https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py from singa import autograd from singa import tensor from singa import device from singa import opt import numpy as np from tqdm import trange if __name__ == "__main__": sgd = opt.SGD(lr=0.1, momentum=0.9, weight_decay=1e-5) sgd = opt.DistOpt(sgd) from resnet import resnet50 model = resnet50() if (sgd.rank_in_global == 0): print("Start intialization...........", flush=True) dev = device.create_cuda_gpu_on(sgd.rank_in_local) niters = 100 batch_size = 32 IMG_SIZE = 224 tx = tensor.Tensor((batch_size, 3, IMG_SIZE, IMG_SIZE), dev) ty = tensor.Tensor((batch_size, ), dev, tensor.int32) autograd.training = True
def train_mnist_cnn(DIST=False,
                    local_rank=None,
                    world_size=None,
                    nccl_id=None,
                    spars=0,
                    topK=False,
                    corr=True):
    """Train and evaluate the MNIST CNN for 10 epochs.

    Args:
        DIST: truthy selects distributed training through ``opt.DistOpt``;
            falsy trains on a single default CUDA device.
        local_rank: forwarded to ``opt.DistOpt`` (distributed path only).
        world_size: forwarded to ``opt.DistOpt``; afterwards replaced by
            ``sgd.world_size`` (distributed) or 1 (single device).
        nccl_id: forwarded to ``opt.DistOpt``.
        spars: 0 selects ``backward_and_update`` with ``threshold=50000``;
            any other value selects ``backward_and_sparse_update``.
        topK: forwarded to ``backward_and_sparse_update``.
        corr: forwarded to ``backward_and_sparse_update``.
    """
    # Fixed hyper-parameters for this example.
    max_epoch = 10
    batch_size = 64
    sgd = opt.SGD(lr=0.005, momentum=0.9, weight_decay=1e-5)
    IMG_SIZE = 28
    num_classes = 10

    # Load data, expand the labels via to_categorical, scale pixels by 1/255.
    train_x, train_y, test_x, test_y = load_dataset()
    train_y = to_categorical(train_y, num_classes)
    test_y = to_categorical(test_y, num_classes)
    train_x = train_x / 255
    test_x = test_x / 255

    if DIST:
        # Distributed: wrap the optimizer and keep only this rank's shard.
        sgd = opt.DistOpt(sgd,
                          nccl_id=nccl_id,
                          local_rank=local_rank,
                          world_size=world_size)
        dev = device.create_cuda_gpu_on(sgd.local_rank)
        train_x, train_y = data_partition(train_x, train_y, sgd.global_rank,
                                          sgd.world_size)
        test_x, test_y = data_partition(test_x, test_y, sgd.global_rank,
                                        sgd.world_size)
        world_size = sgd.world_size
    else:
        # Single GPU.
        dev = device.create_cuda_gpu()
        world_size = 1

    def should_report():
        # Evaluated lazily at each print site, exactly like the inline test.
        return (DIST == False) or (sgd.global_rank == 0)

    model = CNN()
    tx = tensor.Tensor((batch_size, 1, IMG_SIZE, IMG_SIZE), dev,
                       tensor.float32)
    ty = tensor.Tensor((batch_size, num_classes), dev, tensor.int32)

    # Only whole batches are used; a trailing partial batch is dropped.
    num_train_batch = train_x.shape[0] // batch_size
    num_test_batch = test_x.shape[0] // batch_size
    train_seen = num_train_batch * batch_size * world_size
    test_seen = num_test_batch * batch_size * world_size
    order = np.arange(train_x.shape[0], dtype=np.int32)

    if DIST:
        # One forward/backward pass on dummy data so that every parameter
        # exists, then synchronize each parameter across workers.
        autograd.training = True
        dummy_x = np.random.randn(batch_size, 1, IMG_SIZE,
                                  IMG_SIZE).astype(np.float32)
        dummy_y = np.zeros(shape=(batch_size, num_classes), dtype=np.int32)
        tx.copy_from_numpy(dummy_x)
        ty.copy_from_numpy(dummy_y)
        out = model.forward(tx)
        loss = autograd.softmax_cross_entropy(out, ty)
        for p, _ in autograd.backward(loss):
            synchronize(p, sgd)

    for epoch in range(max_epoch):
        start_time = time.time()
        np.random.shuffle(order)

        if should_report():
            print('Starting Epoch %d:' % (epoch))

        # ---- training phase ----
        autograd.training = True
        train_correct = np.zeros(shape=[1], dtype=np.float32)
        test_correct = np.zeros(shape=[1], dtype=np.float32)
        train_loss = np.zeros(shape=[1], dtype=np.float32)

        for step in range(num_train_batch):
            picked = order[step * batch_size:(step + 1) * batch_size]
            x = augmentation(train_x[picked], batch_size)
            y = train_y[picked]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out = model.forward(tx)
            loss = autograd.softmax_cross_entropy(out, ty)
            train_correct += accuracy(tensor.to_numpy(out), y)
            train_loss += tensor.to_numpy(loss)[0]

            if not DIST:
                sgd.backward_and_update(loss)
            elif spars == 0:
                sgd.backward_and_update(loss, threshold=50000)
            else:
                sgd.backward_and_sparse_update(loss,
                                               spars=spars,
                                               topK=topK,
                                               corr=corr)

        if DIST:
            # Combine the per-worker training statistics.
            reducer = tensor.Tensor((1,), dev, tensor.float32)
            train_correct = reduce_variable(train_correct, sgd, reducer)
            train_loss = reduce_variable(train_loss, sgd, reducer)

        if should_report():
            print('Training loss = %f, training accuracy = %f' %
                  (train_loss, train_correct / train_seen),
                  flush=True)

        # ---- evaluation phase ----
        autograd.training = False
        for step in range(num_test_batch):
            lo, hi = step * batch_size, (step + 1) * batch_size
            x = test_x[lo:hi]
            y = test_y[lo:hi]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out_test = model.forward(tx)
            test_correct += accuracy(tensor.to_numpy(out_test), y)

        if DIST:
            # Combine the per-worker evaluation statistic.
            test_correct = reduce_variable(test_correct, sgd, reducer)

        if should_report():
            print('Evaluation accuracy = %f, Elapsed Time = %fs' %
                  (test_correct / test_seen, time.time() - start_time),
                  flush=True)
def train_mnist_cnn(sgd,
                    max_epoch,
                    batch_size,
                    DIST=False,
                    data_partition=None,
                    gpu_num=None,
                    gpu_per_node=None,
                    nccl_id=None):
    """Train and evaluate the MNIST CNN with a caller-supplied optimizer.

    Args:
        sgd: optimizer; wrapped in ``opt.DistOpt`` when ``DIST`` is truthy.
        max_epoch: number of epochs to run.
        batch_size: samples per batch; trailing partial batches are dropped.
        DIST: truthy selects distributed training.
        data_partition: callable used to shard the train and test sets in
            the distributed path; called as
            ``data_partition(x, y, rank_in_global, world_size)``.
        gpu_num: forwarded to ``opt.DistOpt``.
        gpu_per_node: forwarded to ``opt.DistOpt``.
        nccl_id: forwarded to ``opt.DistOpt``.
    """
    IMG_SIZE = 28
    num_classes = 10

    # Load data, expand the labels via to_categorical, scale pixels by 1/255.
    train_x, train_y, test_x, test_y = load_dataset()
    train_y = to_categorical(train_y, num_classes)
    test_y = to_categorical(test_y, num_classes)
    train_x = train_x / 255
    test_x = test_x / 255

    if DIST:
        # Distributed: wrap the optimizer and keep only this rank's shard.
        sgd = opt.DistOpt(sgd,
                          nccl_id=nccl_id,
                          gpu_num=gpu_num,
                          gpu_per_node=gpu_per_node)
        dev = device.create_cuda_gpu_on(sgd.rank_in_local)
        train_x, train_y = data_partition(train_x, train_y,
                                          sgd.rank_in_global, sgd.world_size)
        test_x, test_y = data_partition(test_x, test_y, sgd.rank_in_global,
                                        sgd.world_size)
        world_size = sgd.world_size
    else:
        # Single GPU.
        dev = device.create_cuda_gpu()
        world_size = 1

    def should_report():
        # Evaluated lazily at each print site, exactly like the inline test.
        return (DIST == False) or (sgd.rank_in_global == 0)

    model = CNN()
    tx = tensor.Tensor((batch_size, 1, IMG_SIZE, IMG_SIZE), dev,
                       tensor.float32)
    ty = tensor.Tensor((batch_size, num_classes), dev, tensor.int32)

    num_train_batch = train_x.shape[0] // batch_size
    num_test_batch = test_x.shape[0] // batch_size
    train_seen = num_train_batch * batch_size * world_size
    test_seen = num_test_batch * batch_size * world_size
    order = np.arange(train_x.shape[0], dtype=np.int32)

    if DIST:
        # One forward/backward pass on dummy data so that every parameter
        # exists, then synchronize each parameter across workers.
        autograd.training = True
        dummy_x = np.random.randn(batch_size, 1, IMG_SIZE,
                                  IMG_SIZE).astype(np.float32)
        dummy_y = np.zeros(shape=(batch_size, num_classes), dtype=np.int32)
        tx.copy_from_numpy(dummy_x)
        ty.copy_from_numpy(dummy_y)
        out = model.forward(tx)
        loss = autograd.softmax_cross_entropy(out, ty)
        for p, _ in autograd.backward(loss):
            sychronize(p, sgd)

    for epoch in range(max_epoch):
        start_time = time.time()
        np.random.shuffle(order)

        if should_report():
            print('Starting Epoch %d:' % (epoch))

        # ---- training phase ----
        autograd.training = True
        train_correct = np.zeros(shape=[1], dtype=np.float32)
        test_correct = np.zeros(shape=[1], dtype=np.float32)
        train_loss = np.zeros(shape=[1], dtype=np.float32)

        for step in range(num_train_batch):
            picked = order[step * batch_size:(step + 1) * batch_size]
            x = augmentation(train_x[picked], batch_size)
            y = train_y[picked]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out = model.forward(tx)
            loss = autograd.softmax_cross_entropy(out, ty)
            train_correct += accuracy(tensor.to_numpy(out), y)
            train_loss += tensor.to_numpy(loss)[0]

            # Launch all-reduce on every gradient as it is produced, wait
            # for them all, and only then apply the updates.
            pending = []
            for p, g in autograd.backward(loss):
                if DIST:
                    sgd.all_reduce(g)
                pending.append((p, g))
            if DIST:
                sgd.wait()
            for p, g in pending:
                sgd.update(p, g)

        if DIST:
            # Combine the per-worker training statistics.
            reducer = tensor.Tensor((1, ), dev, tensor.float32)
            train_correct = reduce_variable(train_correct, sgd, reducer)
            train_loss = reduce_variable(train_loss, sgd, reducer)

        if should_report():
            print('Training loss = %f, training accuracy = %f' %
                  (train_loss, train_correct / train_seen),
                  flush=True)

        # ---- evaluation phase ----
        autograd.training = False
        for step in range(num_test_batch):
            lo, hi = step * batch_size, (step + 1) * batch_size
            x = test_x[lo:hi]
            y = test_y[lo:hi]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out_test = model.forward(tx)
            test_correct += accuracy(tensor.to_numpy(out_test), y)

        if DIST:
            # Combine the per-worker evaluation statistic.
            test_correct = reduce_variable(test_correct, sgd, reducer)

        if should_report():
            print('Evaluation accuracy = %f, Elapsed Time = %fs' %
                  (test_correct / test_seen, time.time() - start_time),
                  flush=True)
def train_cifar10(sgd,
                  max_epoch,
                  batch_size,
                  DIST=False,
                  data_partition=None,
                  gpu_num=None,
                  gpu_per_node=None,
                  nccl_id=None,
                  partial_update=False):
    """Train and evaluate ResNet-50 on CIFAR-10 images resized to 224x224.

    Args:
        sgd: optimizer; wrapped in ``opt.DistOpt`` when ``DIST`` is truthy.
        max_epoch: number of epochs to run.
        batch_size: samples per batch; trailing partial batches are dropped.
        DIST: truthy selects distributed training.
        data_partition: callable used to shard the train and test sets in
            the distributed path; called as
            ``data_partition(x, y, rank_in_global, world_size)``.
        gpu_num: forwarded to ``opt.DistOpt``.
        gpu_per_node: forwarded to ``opt.DistOpt``.
        nccl_id: forwarded to ``opt.DistOpt``.
        partial_update: when true, each step calls
            ``sgd.backward_and_partial_update`` and the parameters collected
            during the distributed warm-up are re-synchronized before every
            evaluation phase.
    """
    train_x, train_y = load_train_data()
    test_x, test_y = load_test_data()
    train_x, test_x = normalize_for_resnet(train_x, test_x)
    IMG_SIZE = 224
    num_classes = 10

    if DIST:
        # Distributed: wrap the optimizer and keep only this rank's shard.
        sgd = opt.DistOpt(sgd,
                          nccl_id=nccl_id,
                          gpu_num=gpu_num,
                          gpu_per_node=gpu_per_node)
        dev = device.create_cuda_gpu_on(sgd.rank_in_local)
        train_x, train_y = data_partition(train_x, train_y,
                                          sgd.rank_in_global, sgd.world_size)
        test_x, test_y = data_partition(test_x, test_y, sgd.rank_in_global,
                                        sgd.world_size)
        world_size = sgd.world_size
    else:
        # Single GPU.
        dev = device.create_cuda_gpu()
        world_size = 1

    from resnet import resnet50
    model = resnet50(num_classes=num_classes)

    tx = tensor.Tensor((batch_size, 3, IMG_SIZE, IMG_SIZE), dev,
                       tensor.float32)
    ty = tensor.Tensor((batch_size,), dev, tensor.int32)
    num_train_batch = train_x.shape[0] // batch_size
    num_test_batch = test_x.shape[0] // batch_size
    idx = np.arange(train_x.shape[0], dtype=np.int32)

    # Bound unconditionally: the pre-evaluation sync below iterates over it
    # whenever partial_update is set, and previously it only existed after
    # the distributed warm-up (NameError otherwise).
    param = []

    if DIST:
        # One forward/backward pass on dummy data so that every parameter
        # exists, then synchronize each parameter across workers and keep a
        # handle to it for the partial-update sync.
        autograd.training = True
        x = np.random.randn(batch_size, 3, IMG_SIZE,
                            IMG_SIZE).astype(np.float32)
        y = np.zeros(shape=(batch_size,), dtype=np.int32)
        tx.copy_from_numpy(x)
        ty.copy_from_numpy(y)
        out = model(tx)
        loss = autograd.softmax_cross_entropy(out, ty)
        for p, _ in autograd.backward(loss):
            sychronize(p, sgd)
            param.append(p)

    for epoch in range(max_epoch):
        start_time = time.time()
        np.random.shuffle(idx)

        if ((DIST == False) or (sgd.rank_in_global == 0)):
            print('Starting Epoch %d:' % (epoch))

        # ---- training phase ----
        autograd.training = True
        train_correct = np.zeros(shape=[1], dtype=np.float32)
        test_correct = np.zeros(shape=[1], dtype=np.float32)
        train_loss = np.zeros(shape=[1], dtype=np.float32)

        for b in range(num_train_batch):
            batch_idx = idx[b * batch_size:(b + 1) * batch_size]
            x = train_x[batch_idx]
            x = augmentation(x, batch_size)
            x = resize_dataset(x, IMG_SIZE)
            y = train_y[batch_idx]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out = model(tx)
            loss = autograd.softmax_cross_entropy(out, ty)
            # Labels are integer class ids here; accuracy() is given their
            # to_categorical expansion.
            train_correct += accuracy(tensor.to_numpy(out),
                                      to_categorical(y, num_classes)).astype(
                                          np.float32)
            train_loss += tensor.to_numpy(loss)[0]
            if not partial_update:
                sgd.backward_and_update(loss)
            else:
                sgd.backward_and_partial_update(loss)

        if DIST:
            # Combine the per-worker training statistics.
            reducer = tensor.Tensor((1,), dev, tensor.float32)
            train_correct = reduce_variable(train_correct, sgd, reducer)
            train_loss = reduce_variable(train_loss, sgd, reducer)

        if ((DIST == False) or (sgd.rank_in_global == 0)):
            print('Training loss = %f, training accuracy = %f' %
                  (train_loss, train_correct /
                   (num_train_batch * batch_size * world_size)),
                  flush=True)

        if partial_update:
            # Synchronize parameters before the evaluation phase.
            for p in param:
                sychronize(p, sgd)

        # ---- evaluation phase ----
        autograd.training = False
        for b in range(num_test_batch):
            x = test_x[b * batch_size:(b + 1) * batch_size]
            x = resize_dataset(x, IMG_SIZE)
            y = test_y[b * batch_size:(b + 1) * batch_size]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out_test = model(tx)
            test_correct += accuracy(tensor.to_numpy(out_test),
                                     to_categorical(y, num_classes))

        if DIST:
            # Combine the per-worker evaluation statistic.
            test_correct = reduce_variable(test_correct, sgd, reducer)

        if ((DIST == False) or (sgd.rank_in_global == 0)):
            print('Evaluation accuracy = %f, Elapsed Time = %fs' %
                  (test_correct / (num_test_batch * batch_size * world_size),
                   time.time() - start_time),
                  flush=True)
def train_resnet(DIST='singa', graph=True, sequential=False):
    """Benchmark ResNet-50 training with SINGA DistOpt or a kvstore backend.

    Runs 100 iterations with batch size 32 on one fixed random 224x224
    batch and, on global rank 0, prints the average per-iteration loss and
    update times plus overall throughput.

    Args:
        DIST: ``'singa'`` selects ``opt.DistOpt`` for gradient
            synchronization; any other value selects the ``dist_sync``
            kvstore created through ``singa_kvstore``.
        graph: forwarded to ``model.graph``.
        sequential: NOTE: the passed value is always overwritten with
            ``True`` by both branches below.
    """
    niters = 100
    batch_size = 32
    sgd = opt.SGD(lr=0.1, momentum=0.9, weight_decay=1e-5)
    IMG_SIZE = 224
    use_singa = DIST == 'singa'

    # The in-code note from the original author: for distributed training,
    # sequential has better throughput in the current version.
    if use_singa:
        sgd = opt.DistOpt(sgd)
        world_size = sgd.world_size
        local_rank = sgd.local_rank
        global_rank = sgd.global_rank
        sequential = True
    else:
        kv_type = 'dist_sync'  # synchronization mode
        kv = singa_kvstore.create_kvstore(kv_type, 'sgd', learning_rate=0.005)
        global_rank = kv.rank
        world_size = kv.num_workers
        # The kvstore path keeps using kv.rank as the device index.
        local_rank = kv.rank
        sequential = True

    # Fix: the device used to be created from kv.rank on both paths, but
    # `kv` only exists on the kvstore path, so the default 'singa' path
    # could not obtain a device. Each branch now provides local_rank.
    dev = device.create_cuda_gpu_on(local_rank)

    # One synthetic batch is generated once and reused for every iteration.
    tx = tensor.Tensor((batch_size, 3, IMG_SIZE, IMG_SIZE), dev)
    ty = tensor.Tensor((batch_size,), dev, tensor.int32)
    x = np.random.randn(batch_size, 3, IMG_SIZE, IMG_SIZE).astype(np.float32)
    y = np.random.randint(0, 1000, batch_size, dtype=np.int32)
    tx.copy_from_numpy(x)
    ty.copy_from_numpy(y)

    # Build the model.
    from model import resnet
    model = resnet.resnet50(num_channels=3, num_classes=1000)
    model.train()
    model.on_device(dev)
    model.set_optimizer(sgd)
    model.graph(graph, sequential)

    # Timed training loop; the device is synced only on the 'singa' path.
    if use_singa:
        dev.Sync()
    compute_time = 0.0
    syn_time = 0.0
    start = time.time()
    with trange(niters) as t:
        for _ in t:
            out = model(tx)

            # compute_time covers the loss call only; the forward pass
            # above is outside the timed span.
            compute_start = time.time()
            loss = model.loss(out, ty)
            compute_time += time.time() - compute_start

            # syn_time covers the backward/update call of the chosen backend.
            syn_start = time.time()
            if use_singa:
                model.optim(loss, dist_option='fp32', spars=None)
            else:
                singa_kvstore.backward_and_update(kv, loss)
            syn_time += time.time() - syn_start
    if use_singa:
        dev.Sync()
    end = time.time()

    compute_time = compute_time / float(niters)
    syn_time = syn_time / float(niters)
    titer = (end - start) / float(niters)
    throughput = float(niters * batch_size * world_size) / (end - start)

    # Only global rank 0 reports.
    if global_rank == 0:
        print("compute_time = {}".format(compute_time), flush=True)
        print("syn_time = {}".format(syn_time), flush=True)
        print("Throughput = {} per second".format(throughput), flush=True)
        print("TotalTime={}".format(end - start), flush=True)
        print("Total={}".format(titer), flush=True)