Example #1
from singa import device
from singa import opt
from singa import tensor

import numpy as np
import time

from tqdm import trange


def train_resnet(DIST=True, graph=True, sequential=False, verbosity=0):

    # Define the hyperparameters for train_resnet
    niters = 100
    batch_size = 32
    sgd = opt.SGD(lr=0.1, momentum=0.9, weight_decay=1e-5)

    IMG_SIZE = 224

    # For distributed training, sequential has better throughput in the current version
    if DIST:
        sgd = opt.DistOpt(sgd)
        world_size = sgd.world_size
        local_rank = sgd.local_rank
        global_rank = sgd.global_rank
        sequential = True
    else:
        local_rank = 0
        world_size = 1
        global_rank = 0
        sequential = False

    dev = device.create_cuda_gpu_on(local_rank)

    tx = tensor.Tensor((batch_size, 3, IMG_SIZE, IMG_SIZE), dev)
    ty = tensor.Tensor((batch_size,), dev, tensor.int32)
    x = np.random.randn(batch_size, 3, IMG_SIZE, IMG_SIZE).astype(np.float32)
    y = np.random.randint(0, 1000, batch_size, dtype=np.int32)
    tx.copy_from_numpy(x)
    ty.copy_from_numpy(y)

    dev.SetVerbosity(verbosity)
    dev.SetSkipIteration(5)

    # construct the model
    from model import resnet
    model = resnet.resnet50(num_channels=3, num_classes=1000)

    model.train()
    model.set_optimizer(sgd)
    model.compile([tx], is_train=True, use_graph=graph, sequential=sequential)

    # train model
    dev.Sync()
    start = time.time()
    with trange(niters) as t:
        for _ in t:
            model(tx, ty, dist_option='fp32', spars=None)

    dev.Sync()
    end = time.time()
    titer = (end - start) / float(niters)
    throughput = float(niters * batch_size * world_size) / (end - start)
    if global_rank == 0:
        print("Throughput = {} per second".format(throughput), flush=True)
        print("TotalTime={}".format(end - start), flush=True)
        print("Total={}".format(titer), flush=True)
        dev.PrintTimeProfiling()
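A minimal sketch (not part of the original example) of how train_resnet above might be driven: a single process for one GPU, or one process per GPU launched by MPI when DIST=True. The script name and flags are hypothetical; DistOpt without an explicit nccl_id is assumed to pick up ranks from the MPI environment.

# Hypothetical driver for the train_resnet example above.
# Single GPU:  python train_benchmark.py --no-dist
# Multi GPU:   mpiexec -np 4 python train_benchmark.py   (assumes MPI-enabled SINGA)
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--no-dist", dest="dist", action="store_false", default=True)
    parser.add_argument("--no-graph", dest="graph", action="store_false", default=True)
    parser.add_argument("--verbosity", type=int, default=0)
    args = parser.parse_args()
    train_resnet(DIST=args.dist, graph=args.graph, verbosity=args.verbosity)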
Example #2
def run(args, local_rank, world_size, nccl_id):
    sgd = opt.SGD(lr=args.lr, momentum=0.9, weight_decay=1e-5)
    sgd = opt.DistOpt(sgd,
                      nccl_id=nccl_id,
                      local_rank=local_rank,
                      world_size=world_size)
    train.run(sgd.global_rank, sgd.world_size, sgd.local_rank, args.max_epoch,
              args.batch_size, args.model, args.data, sgd, args.graph,
              args.dist_option, args.spars)
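run above expects one call per GPU with a shared NCCL id. Below is a rough launcher sketch using Python multiprocessing; singa_wrap.NcclIdHolder is how the SINGA multiprocess examples obtain the id, but treat its exact location as an assumption.

# Hypothetical multiprocess launcher for run(args, local_rank, world_size, nccl_id).
import multiprocessing

from singa import singa_wrap as sw  # assumed home of NcclIdHolder


def launch(args, world_size):
    nccl_id = sw.NcclIdHolder()  # one NCCL id shared by every rank on this node
    procs = []
    for local_rank in range(world_size):
        p = multiprocessing.Process(target=run,
                                    args=(args, local_rank, world_size, nccl_id))
        p.start()
        procs.append(p)
    for p in procs:
        p.join()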
Example #3
def run(args, local_rank, world_size, nccl_id):
    sgd = opt.SGD(lr=args.lr,
                  momentum=0.9,
                  weight_decay=1e-5,
                  dtype=singa_dtype[args.precision])
    sgd = opt.DistOpt(sgd,
                      nccl_id=nccl_id,
                      local_rank=local_rank,
                      world_size=world_size)
    train_cnn.run(sgd.global_rank, sgd.world_size, sgd.local_rank,
                  args.max_epoch, args.batch_size, args.model, args.data, sgd,
                  args.graph, args.verbosity, args.dist_option, args.spars,
                  args.precision)
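Example #3 additionally passes dtype=singa_dtype[args.precision] to the optimizer. singa_dtype is a lookup table defined elsewhere in the original script; a plausible sketch of it is below (the exact keys and dtype constants are an assumption).

# Plausible precision-string -> SINGA dtype mapping, as used via singa_dtype above.
from singa import tensor

singa_dtype = {
    "float32": tensor.float32,  # full precision (default)
    "float16": tensor.float16,  # half precision for mixed-precision training
}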
Example #4
#

# the code is modified from
# https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py

from singa import autograd
from singa import tensor
from singa import device
from singa import opt

import numpy as np
from tqdm import trange

if __name__ == "__main__":
    sgd = opt.SGD(lr=0.1, momentum=0.9, weight_decay=1e-5)
    sgd = opt.DistOpt(sgd)

    from resnet import resnet50
    model = resnet50()

    if (sgd.rank_in_global == 0):
        print("Start intialization...........", flush=True)

    dev = device.create_cuda_gpu_on(sgd.rank_in_local)
    niters = 100
    batch_size = 32
    IMG_SIZE = 224

    tx = tensor.Tensor((batch_size, 3, IMG_SIZE, IMG_SIZE), dev)
    ty = tensor.Tensor((batch_size, ), dev, tensor.int32)
    autograd.training = True
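The snippet above is cut off right after the input tensors are created. Below is a rough sketch of how the benchmark loop could continue, modelled on the autograd-style loops in Examples #6 and #7; the random data and the use of sgd.backward_and_update here are assumptions about this particular script.

    # Fill the inputs with random data (1000 classes, matching resnet50's default head).
    x = np.random.randn(batch_size, 3, IMG_SIZE, IMG_SIZE).astype(np.float32)
    y = np.random.randint(0, 1000, batch_size, dtype=np.int32)
    tx.copy_from_numpy(x)
    ty.copy_from_numpy(y)

    with trange(niters) as t:
        for _ in t:
            out = model(tx)
            loss = autograd.softmax_cross_entropy(out, ty)
            sgd.backward_and_update(loss)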
Example #5
File: mnist_cnn.py  Project: zxr8192/singa
def train_mnist_cnn(DIST=False,
                    local_rank=None,
                    world_size=None,
                    nccl_id=None,
                    spars=0,
                    topK=False,
                    corr=True):

    # Define the hyperparameters for the mnist_cnn
    max_epoch = 10
    batch_size = 64
    sgd = opt.SGD(lr=0.005, momentum=0.9, weight_decay=1e-5)

    # Prepare training and validation data
    train_x, train_y, test_x, test_y = load_dataset()
    IMG_SIZE = 28
    num_classes = 10
    train_y = to_categorical(train_y, num_classes)
    test_y = to_categorical(test_y, num_classes)

    # Normalization
    train_x = train_x / 255
    test_x = test_x / 255

    if DIST:
        # For Distributed GPU Training
        sgd = opt.DistOpt(sgd,
                          nccl_id=nccl_id,
                          local_rank=local_rank,
                          world_size=world_size)
        dev = device.create_cuda_gpu_on(sgd.local_rank)
        # Dataset partition for distributed training
        train_x, train_y = data_partition(train_x, train_y, sgd.global_rank,
                                          sgd.world_size)
        test_x, test_y = data_partition(test_x, test_y, sgd.global_rank,
                                        sgd.world_size)
        world_size = sgd.world_size
    else:
        # For Single GPU
        dev = device.create_cuda_gpu()
        world_size = 1

    # create model
    model = CNN()

    tx = tensor.Tensor((batch_size, 1, IMG_SIZE, IMG_SIZE), dev, tensor.float32)
    ty = tensor.Tensor((batch_size, num_classes), dev, tensor.int32)
    num_train_batch = train_x.shape[0] // batch_size
    num_test_batch = test_x.shape[0] // batch_size
    idx = np.arange(train_x.shape[0], dtype=np.int32)

    if DIST:
        # Synchronize the initial parameters
        autograd.training = True
        x = np.random.randn(batch_size, 1, IMG_SIZE,
                            IMG_SIZE).astype(np.float32)
        y = np.zeros(shape=(batch_size, num_classes), dtype=np.int32)
        tx.copy_from_numpy(x)
        ty.copy_from_numpy(y)
        out = model.forward(tx)
        loss = autograd.softmax_cross_entropy(out, ty)
        for p, g in autograd.backward(loss):
            synchronize(p, sgd)

    # Training and Evaluation Loop
    for epoch in range(max_epoch):
        start_time = time.time()
        np.random.shuffle(idx)

        if ((DIST == False) or (sgd.global_rank == 0)):
            print('Starting Epoch %d:' % (epoch))

        # Training Phase
        autograd.training = True
        train_correct = np.zeros(shape=[1], dtype=np.float32)
        test_correct = np.zeros(shape=[1], dtype=np.float32)
        train_loss = np.zeros(shape=[1], dtype=np.float32)

        for b in range(num_train_batch):
            x = train_x[idx[b * batch_size:(b + 1) * batch_size]]
            x = augmentation(x, batch_size)
            y = train_y[idx[b * batch_size:(b + 1) * batch_size]]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out = model.forward(tx)
            loss = autograd.softmax_cross_entropy(out, ty)
            train_correct += accuracy(tensor.to_numpy(out), y)
            train_loss += tensor.to_numpy(loss)[0]
            if DIST:
                if (spars == 0):
                    sgd.backward_and_update(loss, threshold=50000)
                else:
                    sgd.backward_and_sparse_update(loss,
                                                   spars=spars,
                                                   topK=topK,
                                                   corr=corr)
            else:
                sgd.backward_and_update(loss)

        if DIST:
            # Reduce the Training Accuracy and Loss from Multiple Devices
            reducer = tensor.Tensor((1,), dev, tensor.float32)
            train_correct = reduce_variable(train_correct, sgd, reducer)
            train_loss = reduce_variable(train_loss, sgd, reducer)

        # Output the Training Loss and Accuracy
        if ((DIST == False) or (sgd.global_rank == 0)):
            print('Training loss = %f, training accuracy = %f' %
                  (train_loss, train_correct /
                   (num_train_batch * batch_size * world_size)),
                  flush=True)

        # Evaluation Phase
        autograd.training = False
        for b in range(num_test_batch):
            x = test_x[b * batch_size:(b + 1) * batch_size]
            y = test_y[b * batch_size:(b + 1) * batch_size]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out_test = model.forward(tx)
            test_correct += accuracy(tensor.to_numpy(out_test), y)

        if DIST:
            # Reduce the Evaluation Accuracy from Multiple Devices
            test_correct = reduce_variable(test_correct, sgd, reducer)

        # Output the Evaluation Accuracy
        if ((DIST == False) or (sgd.global_rank == 0)):
            print('Evaluation accuracy = %f, Elapsed Time = %fs' %
                  (test_correct / (num_test_batch * batch_size * world_size),
                   time.time() - start_time),
                  flush=True)
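Example #5 leans on helpers defined elsewhere in mnist_cnn.py (load_dataset, augmentation, data_partition, reduce_variable, synchronize, accuracy, to_categorical). Plausible NumPy sketches of the two label/metric helpers are below, to make the accuracy accounting explicit; the originals may differ in detail.

import numpy as np


def to_categorical(y, num_classes):
    # One-hot encode integer labels into an (N, num_classes) float32 matrix.
    y = np.asarray(y, dtype="int32")
    one_hot = np.zeros((len(y), num_classes), dtype=np.float32)
    one_hot[np.arange(len(y)), y] = 1.0
    return one_hot


def accuracy(pred, target):
    # Count correct predictions; both arguments are (N, num_classes) arrays.
    return np.sum(np.argmax(pred, axis=1) == np.argmax(target, axis=1))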
Example #6
def train_mnist_cnn(sgd,
                    max_epoch,
                    batch_size,
                    DIST=False,
                    data_partition=None,
                    gpu_num=None,
                    gpu_per_node=None,
                    nccl_id=None):

    # Prepare training and validation data
    train_x, train_y, test_x, test_y = load_dataset()
    IMG_SIZE = 28
    num_classes = 10
    train_y = to_categorical(train_y, num_classes)
    test_y = to_categorical(test_y, num_classes)

    # Normalization
    train_x = train_x / 255
    test_x = test_x / 255

    if DIST:
        # For Distributed GPU Training
        sgd = opt.DistOpt(sgd,
                          nccl_id=nccl_id,
                          gpu_num=gpu_num,
                          gpu_per_node=gpu_per_node)
        dev = device.create_cuda_gpu_on(sgd.rank_in_local)
        # Dataset partition for distributed training
        train_x, train_y = data_partition(train_x, train_y, sgd.rank_in_global,
                                          sgd.world_size)
        test_x, test_y = data_partition(test_x, test_y, sgd.rank_in_global,
                                        sgd.world_size)
        world_size = sgd.world_size
    else:
        # For Single GPU
        dev = device.create_cuda_gpu()
        world_size = 1

    # create model
    model = CNN()

    tx = tensor.Tensor((batch_size, 1, IMG_SIZE, IMG_SIZE), dev,
                       tensor.float32)
    ty = tensor.Tensor((batch_size, num_classes), dev, tensor.int32)
    num_train_batch = train_x.shape[0] // batch_size
    num_test_batch = test_x.shape[0] // batch_size
    idx = np.arange(train_x.shape[0], dtype=np.int32)

    if DIST:
        # Synchronize the initial parameters
        autograd.training = True
        x = np.random.randn(batch_size, 1, IMG_SIZE,
                            IMG_SIZE).astype(np.float32)
        y = np.zeros(shape=(batch_size, num_classes), dtype=np.int32)
        tx.copy_from_numpy(x)
        ty.copy_from_numpy(y)
        out = model.forward(tx)
        loss = autograd.softmax_cross_entropy(out, ty)
        for p, g in autograd.backward(loss):
            sychronize(p, sgd)

    # Training and Evaluation Loop
    for epoch in range(max_epoch):
        start_time = time.time()
        np.random.shuffle(idx)

        if ((DIST == False) or (sgd.rank_in_global == 0)):
            print('Starting Epoch %d:' % (epoch))

        # Training Phase
        autograd.training = True
        train_correct = np.zeros(shape=[1], dtype=np.float32)
        test_correct = np.zeros(shape=[1], dtype=np.float32)
        train_loss = np.zeros(shape=[1], dtype=np.float32)

        for b in range(num_train_batch):
            x = train_x[idx[b * batch_size:(b + 1) * batch_size]]
            x = augmentation(x, batch_size)
            y = train_y[idx[b * batch_size:(b + 1) * batch_size]]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out = model.forward(tx)
            loss = autograd.softmax_cross_entropy(out, ty)
            train_correct += accuracy(tensor.to_numpy(out), y)
            train_loss += tensor.to_numpy(loss)[0]
            plist = []
            for p, g in autograd.backward(loss):
                if DIST:
                    sgd.all_reduce(g)
                plist.append((p, g))
            if DIST:
                sgd.wait()
            for p, g in plist:
                sgd.update(p, g)

        if DIST:
            # Reduce the Training Accuracy and Loss from Multiple Devices
            reducer = tensor.Tensor((1, ), dev, tensor.float32)
            train_correct = reduce_variable(train_correct, sgd, reducer)
            train_loss = reduce_variable(train_loss, sgd, reducer)

        # Output the Training Loss and Accuracy
        if ((DIST == False) or (sgd.rank_in_global == 0)):
            print('Training loss = %f, training accuracy = %f' %
                  (train_loss, train_correct /
                   (num_train_batch * batch_size * world_size)),
                  flush=True)

        # Evaluation Phase
        autograd.training = False
        for b in range(num_test_batch):
            x = test_x[b * batch_size:(b + 1) * batch_size]
            y = test_y[b * batch_size:(b + 1) * batch_size]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out_test = model.forward(tx)
            test_correct += accuracy(tensor.to_numpy(out_test), y)

        if DIST:
            # Reduce the Evaluation Accuracy from Multiple Devices
            test_correct = reduce_variable(test_correct, sgd, reducer)

        # Output the Evaluation Accuracy
        if ((DIST == False) or (sgd.rank_in_global == 0)):
            print('Evaluation accuracy = %f, Elapsed Time = %fs' %
                  (test_correct / (num_test_batch * batch_size * world_size),
                   time.time() - start_time),
                  flush=True)
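Examples #5 - #7 also call reduce_variable and a helper spelled sychronize (sic), both defined elsewhere in the original files. The sketch below is built only from the DistOpt calls already visible above (all_reduce, wait, world_size); treat the exact bodies as assumptions.

from singa import tensor


def reduce_variable(variable, dist_opt, reducer):
    # Sum a host-side NumPy scalar across all ranks via the on-device reducer tensor.
    reducer.copy_from_numpy(variable)
    dist_opt.all_reduce(reducer.data)
    dist_opt.wait()
    return tensor.to_numpy(reducer)


def sychronize(param, dist_opt):
    # All-reduce a parameter tensor and divide by world_size so every rank
    # starts from the same averaged initial values.
    dist_opt.all_reduce(param.data)
    dist_opt.wait()
    param /= dist_opt.world_size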
Example #7
def train_cifar10(sgd,
                  max_epoch,
                  batch_size,
                  DIST=False,
                  data_partition=None,
                  gpu_num=None,
                  gpu_per_node=None,
                  nccl_id=None,
                  partial_update=False):

    train_x, train_y = load_train_data()
    test_x, test_y = load_test_data()
    train_x, test_x = normalize_for_resnet(train_x, test_x)
    IMG_SIZE = 224
    num_classes = 10

    if DIST:
        # For Distributed GPU Training
        sgd = opt.DistOpt(sgd,
                          nccl_id=nccl_id,
                          gpu_num=gpu_num,
                          gpu_per_node=gpu_per_node)
        dev = device.create_cuda_gpu_on(sgd.rank_in_local)
        # Dataset partition for distributed training
        train_x, train_y = data_partition(train_x, train_y, sgd.rank_in_global,
                                          sgd.world_size)
        test_x, test_y = data_partition(test_x, test_y, sgd.rank_in_global,
                                        sgd.world_size)
        world_size = sgd.world_size
    else:
        # For Single GPU
        dev = device.create_cuda_gpu()
        world_size = 1

    from resnet import resnet50
    model = resnet50(num_classes=num_classes)

    tx = tensor.Tensor((batch_size, 3, IMG_SIZE, IMG_SIZE), dev, tensor.float32)
    ty = tensor.Tensor((batch_size,), dev, tensor.int32)
    num_train_batch = train_x.shape[0] // batch_size
    num_test_batch = test_x.shape[0] // batch_size
    idx = np.arange(train_x.shape[0], dtype=np.int32)

    if DIST:
        # Synchronize the initial parameters
        autograd.training = True
        x = np.random.randn(batch_size, 3, IMG_SIZE, IMG_SIZE).astype(np.float32)
        y = np.zeros(shape=(batch_size,), dtype=np.int32)
        tx.copy_from_numpy(x)
        ty.copy_from_numpy(y)
        out = model(tx)
        loss = autograd.softmax_cross_entropy(out, ty)
        param = []
        for p, _ in autograd.backward(loss):
            sychronize(p, sgd)
            param.append(p)

    for epoch in range(max_epoch):
        start_time = time.time()
        np.random.shuffle(idx)

        if ((DIST == False) or (sgd.rank_in_global == 0)):
            print('Starting Epoch %d:' % (epoch))

        # Training Phase
        autograd.training = True
        train_correct = np.zeros(shape=[1], dtype=np.float32)
        test_correct = np.zeros(shape=[1], dtype=np.float32)
        train_loss = np.zeros(shape=[1], dtype=np.float32)

        for b in range(num_train_batch):
            x = train_x[idx[b * batch_size: (b + 1) * batch_size]]
            x = augmentation(x, batch_size)
            x = resize_dataset(x, IMG_SIZE)
            y = train_y[idx[b * batch_size: (b + 1) * batch_size]]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out = model(tx)
            loss = autograd.softmax_cross_entropy(out, ty)
            train_correct += accuracy(tensor.to_numpy(out),
                                      to_categorical(y, num_classes)).astype(np.float32)
            train_loss += tensor.to_numpy(loss)[0]
            if not partial_update:
                sgd.backward_and_update(loss)
            else:
                sgd.backward_and_partial_update(loss)

        if DIST:
            # Reduce the Training Accuracy and Loss from Multiple Devices
            reducer = tensor.Tensor((1,), dev, tensor.float32)
            train_correct = reduce_variable(train_correct, sgd, reducer)
            train_loss = reduce_variable(train_loss, sgd, reducer)

        # Output the Training Loss and Accuracy
        if ((DIST == False) or (sgd.rank_in_global == 0)):
            print('Training loss = %f, training accuracy = %f' %
                  (train_loss, train_correct /
                   (num_train_batch * batch_size * world_size)),
                  flush=True)

        if partial_update:
            # synchronize parameters before the evaluation phase
            for p in param:
                sychronize(p, sgd)

        # Evaluation Phase
        autograd.training = False
        for b in range(num_test_batch):
            x = test_x[b * batch_size: (b + 1) * batch_size]
            x = resize_dataset(x, IMG_SIZE)
            y = test_y[b * batch_size: (b + 1) * batch_size]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out_test = model(tx)
            test_correct += accuracy(tensor.to_numpy(out_test), to_categorical(y, num_classes))

        if DIST:
            # Reduce the Evaluation Accuracy from Multiple Devices
            test_correct = reduce_variable(test_correct, sgd, reducer)

        # Output the Evaluation Accuracy
        if ((DIST == False) or (sgd.rank_in_global == 0)):
            print('Evaluation accuracy = %f, Elapsed Time = %fs' %
                  (test_correct / (num_test_batch * batch_size * world_size),
                   time.time() - start_time),
                  flush=True)
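train_cifar10 resizes the 32x32 CIFAR-10 images to 224x224 for ResNet-50 and applies light augmentation. Rough sketches of what resize_dataset and augmentation typically look like in these scripts are below; the real implementations may use a different interpolation, padding, or crop policy.

import numpy as np
from PIL import Image


def resize_dataset(x, image_size):
    # Resize a batch of (N, C, H, W) float images to (N, C, image_size, image_size).
    num, ch = x.shape[0], x.shape[1]
    out = np.zeros((num, ch, image_size, image_size), dtype=np.float32)
    for n in range(num):
        for c in range(ch):
            out[n, c, :, :] = np.array(
                Image.fromarray(x[n, c, :, :]).resize((image_size, image_size)),
                dtype=np.float32)
    return out


def augmentation(x, batch_size):
    # Pad by 4 pixels, take a random crop back to the original size,
    # then flip horizontally with probability 0.5.
    xpad = np.pad(x, [[0, 0], [0, 0], [4, 4], [4, 4]], 'symmetric')
    for n in range(batch_size):
        dy, dx = np.random.randint(0, 9, size=2)
        crop = xpad[n, :, dy:dy + x.shape[2], dx:dx + x.shape[3]]
        if np.random.randint(2):
            crop = crop[:, :, ::-1]
        x[n] = crop
    return x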
Example #8
def train_resnet(DIST='singa', graph=True, sequential=False):

    # Define the hyperparameters for train_resnet
    niters = 100
    batch_size = 32
    sgd = opt.SGD(lr=0.1, momentum=0.9, weight_decay=1e-5)

    IMG_SIZE = 224

    # For distributed training, sequential has better throughput in the current version
    if DIST == 'singa':
        sgd = opt.DistOpt(sgd)
        world_size = sgd.world_size
        local_rank = sgd.local_rank
        global_rank = sgd.global_rank
        sequential = True
    else:
        kv_type = 'dist_sync'  # set synchronization mode
        kv = singa_kvstore.create_kvstore(kv_type, 'sgd', learning_rate=0.005)
        global_rank = kv.rank
        local_rank = kv.rank
        world_size = kv.num_workers
        sequential = True

    # local_rank is set in both branches; kv does not exist in the 'singa' branch
    dev = device.create_cuda_gpu_on(local_rank)

    tx = tensor.Tensor((batch_size, 3, IMG_SIZE, IMG_SIZE), dev)
    ty = tensor.Tensor((batch_size,), dev, tensor.int32)
    x = np.random.randn(batch_size, 3, IMG_SIZE, IMG_SIZE).astype(np.float32)
    y = np.random.randint(0, 1000, batch_size, dtype=np.int32)
    tx.copy_from_numpy(x)
    ty.copy_from_numpy(y)

    # construct the model
    from model import resnet
    model = resnet.resnet50(num_channels=3, num_classes=1000)

    model.train()
    model.on_device(dev)
    model.set_optimizer(sgd)
    model.graph(graph, sequential)

    # train model
    if DIST == 'singa':
        dev.Sync()
    compute_time = 0.0
    syn_time = 0.0
    start = time.time()
    with trange(niters) as t:
        for _ in t:
            out = model(tx)
            compute_start = time.time()
            loss = model.loss(out, ty)
            compute_time += time.time() - compute_start
            if DIST == 'singa':
                syn_start = time.time()
                model.optim(loss, dist_option='fp32', spars=None)
                syn_time += time.time() - syn_start
            else:
                # autograd.training = True
                syn_start = time.time()
                singa_kvstore.backward_and_update(kv, loss)
                syn_time += time.time() - syn_start
    if DIST == 'singa':
        dev.Sync()
    end = time.time()
    compute_time = compute_time / float(niters)
    syn_time = syn_time / float(niters)
    titer = (end - start) / float(niters)
    throughput = float(niters * batch_size * world_size) / (end - start)
    if global_rank == 0:
        print("compute_time = {}".format(compute_time),flush=True)
        print("syn_time = {}".format(syn_time),flush=True)
        print("Throughput = {} per second".format(throughput), flush=True)
        print("TotalTime={}".format(end - start), flush=True)
        print("Total={}".format(titer), flush=True)