Example #1
def onnx_to_singa(niter, use_cpu=False):
    if use_cpu:
        print("Using CPU")
        dev = device.get_default_device()
    else:
        print("Using GPU")
        dev = device.create_cuda_gpu()
    model = sonnx.load("mlp.onnx")
    backend = sonnx.prepare(model, device=dev)
    sgd = opt.SGD(0.1)
    inputs = Tensor(
        data=data,
        device=dev,
        requires_grad=False,
        stores_grad=False,
        name="input",
    )
    target = Tensor(
        data=label,
        device=dev,
        requires_grad=False,
        stores_grad=False,
        name="target",
    )

    for i in range(niter):
        y = backend.run([inputs])[0]
        loss = autograd.softmax_cross_entropy(y, target)
        for p, gp in autograd.backward(loss):
            sgd.update(p, gp)
        loss_rate = tensor.to_numpy(loss)[0]
        accuracy_rate = accuracy(tensor.to_numpy(y), label)

        print("Iter {}, accurate={}, loss={}".format(i, accuracy_rate, loss_rate))
Example #2
def singa_to_onnx(epochs, use_cpu=False, batchsize=32):
    sgd = opt.SGD(lr=0.1)

    # operations initialization
    conv1 = autograd.Conv2d(1, 8, 3, 2, padding=1) # 28 - 14
    conv2 = autograd.Conv2d(8, 4, 3, 2, padding=1) # 14 - 7
    pooling = autograd.MaxPool2d(3, 2, padding=1) # 7 - 4
    linear = autograd.Linear(64, 10)

    def forward(x, t):
        y = conv1(x)
        y = autograd.relu(y)
        y = conv2(y)
        y = autograd.relu(y)
        y = pooling(y)
        y = autograd.flatten(y)
        y = linear(y)
        loss = autograd.softmax_cross_entropy(y, t)
        return loss, y

    autograd.training = True
    (x_train, y_train), (x_test, y_test), dev = common(use_cpu)

    niter = 1 # x_train.shape[0] // batchsize
    for epoch in range(epochs):
        accuracy_rate = 0.0
        loss_rate = 0.0
        for i in range(niter):
            inputs = tensor.Tensor(
                device=dev,
                data=x_train[i * batchsize : (i + 1) * batchsize],
                stores_grad=False,
                name="input",
            )
            targets = tensor.Tensor(
                device=dev,
                data=y_train[i * batchsize : (i + 1) * batchsize],
                requires_grad=False,
                stores_grad=False,
                name="target",
            )
            loss, y = forward(inputs, targets)
            accuracy_rate += accuracy(
                tensor.to_numpy(y), y_train[i * batchsize : (i + 1) * batchsize]
            )
            loss_rate += tensor.to_numpy(loss)[0]
            for p, gp in autograd.backward(loss):
                sgd.update(p, gp)
        print( "accuracy is {}, loss is {}".format( accuracy_rate / niter, loss_rate / niter))
    model = sonnx.to_onnx_model([inputs], [y])
    sonnx.save(model, "cnn.onnx")
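
The `common(use_cpu)` helper used above (and again in Example #6) is not shown. Judging from how its return values are used, a plausible sketch is the following; `load_dataset` is a placeholder name borrowed from Examples #13 and #14:

def common(use_cpu):
    # Pick the device, mirroring the CPU/GPU switch in Example #1.
    if use_cpu:
        print("Using CPU")
        dev = device.get_default_device()
    else:
        print("Using GPU")
        dev = device.create_cuda_gpu()
    # Load the training and test arrays (loader name is a placeholder).
    x_train, y_train, x_test, y_test = load_dataset()
    return (x_train, y_train), (x_test, y_test), dev
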
Example #3
    def test_vanillaRNN_gpu_tiny_ops_shape_check(self):
        # gradients shape check.
        inputs, target, h0 = prepare_inputs_targets_for_rnn_test()
        rnn = autograd.RNN(3, 2)

        hs, _ = rnn(inputs, h0)

        loss = autograd.softmax_cross_entropy(hs[0], target[0])
        for i in range(1, len(hs)):
            l = autograd.softmax_cross_entropy(hs[i], target[i])
            loss = autograd.add(loss, l)
        # d=autograd.infer_dependency(loss.creator)
        # print(d)
        for t, dt in autograd.backward(loss):
            self.check_shape(t.shape, dt.shape)
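
`prepare_inputs_targets_for_rnn_test()` and the `gpu_dev` device are defined elsewhere in the original test file. A sketch consistent with `autograd.RNN(3, 2)` (input size 3, hidden size 2) and a batch size of 2; the sequence length of 3 is a guess, and the singa imports are assumed as in the snippets above:

gpu_dev = device.create_cuda_gpu()

def prepare_inputs_targets_for_rnn_test():
    # Three time steps of (batch=2, input=3) inputs and (batch=2, hidden=2) targets.
    xs = [np.random.random((2, 3)).astype(np.float32) for _ in range(3)]
    ts = [np.random.random((2, 2)).astype(np.float32) for _ in range(3)]
    h = np.zeros((2, 2), dtype=np.float32)
    inputs = [tensor.Tensor(device=gpu_dev, data=x) for x in xs]
    target = [tensor.Tensor(device=gpu_dev, data=t) for t in ts]
    h0 = tensor.Tensor(device=gpu_dev, data=h)
    return inputs, target, h0
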
Example #4
def singa_to_onnx(niter, use_cpu=False):
    if use_cpu:
        print("Using CPU")
        dev = device.get_default_device()
    else:
        print("Using GPU")
        dev = device.create_cuda_gpu()
    inputs = Tensor(
        data=data,
        device=dev,
        requires_grad=False,
        stores_grad=False,
        name="input",
    )
    target = Tensor(
        data=label,
        device=dev,
        requires_grad=False,
        stores_grad=False,
        name="target",
    )

    w0 = Tensor(shape=(2, 3), device=dev, requires_grad=True, stores_grad=True)
    w0.gaussian(0.0, 0.1)
    b0 = Tensor(shape=(3,), device=dev, requires_grad=True, stores_grad=True)
    b0.set_value(0.0)

    w1 = Tensor(shape=(3, 2), device=dev, requires_grad=True, stores_grad=True)
    w1.gaussian(0.0, 0.1)
    b1 = Tensor(shape=(2,), device=dev, requires_grad=True, stores_grad=True)
    b1.set_value(0.0)

    sgd = opt.SGD(0.1)
    # training process
    for i in range(niter):
        x = autograd.matmul(inputs, w0)
        x = autograd.add_bias(x, b0)
        x = autograd.relu(x)
        x = autograd.matmul(x, w1)
        x = autograd.add_bias(x, b1)
        loss = autograd.softmax_cross_entropy(x, target)
        for p, gp in autograd.backward(loss):
            sgd.update(p, gp)

        print("training loss = ", tensor.to_numpy(loss)[0])
    sonnx.export([inputs], [x], file_path="mlp.onnx")
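
As in Example #1, `data` and `label` are prepared earlier in the original script. A placeholder two-class, 2-D dataset matching the MLP shapes above (input dimension 2, two output classes), reusing the `to_categorical` helper sketched after Example #9:

# Placeholder data preparation; not the original script's.
data = np.random.uniform(-1, 1, (400, 2)).astype(np.float32)
raw_label = (5 * data[:, 0] + 1 > data[:, 1]).astype(np.int32)  # an arbitrary linear boundary
label = to_categorical(raw_label, 2).astype(np.float32)         # one-hot targets
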
Example #5
    def test_LSTM_gpu_tiny_ops_shape_check(self):
        # gradients shape check.
        inputs, target, h0 = prepare_inputs_targets_for_rnn_test()
        c_0 = np.random.random((2, 1)).astype(np.float32)
        c0 = tensor.Tensor(device=gpu_dev, data=c_0)

        rnn = autograd.LSTM(3, 2)

        hs, _, _ = rnn(inputs, (h0, c0))
        loss = autograd.softmax_cross_entropy(hs[0], target[0])

        for i in range(1, len(hs)):
            l = autograd.softmax_cross_entropy(hs[i], target[i])
            loss = autograd.add(loss, l)
        # d=autograd.infer_dependency(loss.creator)
        # print(d)
        for t, dt in autograd.backward(loss):
            self.check_shape(t.shape, dt.shape)
Example #6
def onnx_to_singa(epochs, use_cpu=False, batchsize=32):
    (x_train, y_train), (x_test, y_test), dev = common(use_cpu)
    model = sonnx.load("cnn.onnx")
    backend = sonnx.prepare(model, dev)
    autograd.training = True
    sgd = opt.SGD(lr=0.01)
    niter = x_train.shape[0] // batchsize
    for epoch in range(epochs):
        accuracy_rate = 0.0
        loss_rate = 0.0
        for i in range(niter):
            inputs = tensor.Tensor(
                device=dev,
                data=x_train[i * batchsize : (i + 1) * batchsize],
                stores_grad=False,
                name="input",
            )
            targets = tensor.Tensor(
                device=dev,
                data=y_train[i * batchsize : (i + 1) * batchsize],
                requires_grad=False,
                stores_grad=False,
                name="target",
            )
            y = backend.run([inputs])[0]
            loss = autograd.softmax_cross_entropy(y, targets)

            accuracy_rate += accuracy(
                tensor.to_numpy(y), y_train[i * batchsize : (i + 1) * batchsize]
            )
            loss_rate += tensor.to_numpy(loss)[0]

            for p, gp in autograd.backward(loss):
                sgd.update(p, gp)

        print("accuracy is {}, loss is {}".format(accuracy_rate / niter, loss_rate / niter))
Example #7
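        # (The Xception class definition is omitted from this snippet; only the
        # closing `return x` of one of its methods remains below.)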
        return x


if __name__ == '__main__':
    model = Xception(num_classes=1000)
    print('Start initialization............')
    dev = device.create_cuda_gpu_on(0)
    #dev = device.create_cuda_gpu()

    niters = 20
    batch_size = 16
    IMG_SIZE = 299
    sgd = opt.SGD(lr=0.1, momentum=0.9, weight_decay=1e-5)

    tx = tensor.Tensor((batch_size, 3, IMG_SIZE, IMG_SIZE), dev)
    ty = tensor.Tensor((batch_size, ), dev, tensor.int32)
    autograd.training = True
    x = np.random.randn(batch_size, 3, IMG_SIZE, IMG_SIZE).astype(np.float32)
    y = np.random.randint(0, 1000, batch_size, dtype=np.int32)
    tx.copy_from_numpy(x)
    ty.copy_from_numpy(y)

    with trange(niters) as t:
        for b in t:
            x = model(tx)
            loss = autograd.softmax_cross_entropy(x, ty)
            for p, g in autograd.backward(loss):
                # print(p.shape, g.shape)
                sgd.update(p, g)
                # pass
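
This training loop assumes the `Xception` class defined earlier in the same file plus roughly the following imports (a best guess; the original import block is not part of the snippet):

import numpy as np
from tqdm import trange
from singa import autograd, device, opt, tensor
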
Example #8
def train_cifar10(sgd, max_epoch, batch_size, DIST=False, data_partition=None, gpu_num=None, gpu_per_node=None, nccl_id=None, partial_update=False):

    train_x, train_y = load_train_data()
    test_x, test_y = load_test_data()
    train_x, test_x = normalize_for_resnet(train_x, test_x)
    IMG_SIZE = 224
    num_classes=10

    if DIST:
        # For Distributed GPU Training
        sgd = opt.DistOpt(sgd, nccl_id=nccl_id, gpu_num=gpu_num, gpu_per_node=gpu_per_node)
        dev = device.create_cuda_gpu_on(sgd.rank_in_local)
        # Dataset partition for distributed training
        train_x, train_y = data_partition(train_x, train_y, sgd.rank_in_global, sgd.world_size)
        test_x, test_y = data_partition(test_x, test_y, sgd.rank_in_global, sgd.world_size)
        world_size = sgd.world_size
    else:
        # For Single GPU
        dev = device.create_cuda_gpu()
        world_size = 1

    from resnet import resnet50
    model = resnet50(num_classes=num_classes)

    tx = tensor.Tensor((batch_size, 3, IMG_SIZE, IMG_SIZE), dev, tensor.float32)
    ty = tensor.Tensor((batch_size,), dev, tensor.int32)
    num_train_batch = train_x.shape[0] // batch_size
    num_test_batch = test_x.shape[0] // batch_size
    idx = np.arange(train_x.shape[0], dtype=np.int32)

    if DIST:
        # Synchronize the initial parameters
        autograd.training = True
        x = np.random.randn(batch_size, 3, IMG_SIZE, IMG_SIZE).astype(np.float32)
        y = np.zeros( shape=(batch_size,), dtype=np.int32)
        tx.copy_from_numpy(x)
        ty.copy_from_numpy(y)
        out = model(tx)
        loss = autograd.softmax_cross_entropy(out, ty)
        param = []
        for p, _ in autograd.backward(loss):
            sychronize(p, sgd)
            param.append(p)

    for epoch in range(max_epoch):
        start_time = time.time()
        np.random.shuffle(idx)

        if ((DIST == False) or (sgd.rank_in_global == 0)):
            print('Starting Epoch %d:' % (epoch))

        #Training Phase
        autograd.training = True
        train_correct = np.zeros(shape=[1],dtype=np.float32)
        test_correct = np.zeros(shape=[1],dtype=np.float32)
        train_loss = np.zeros(shape=[1],dtype=np.float32)

        for b in range(num_train_batch):
            x = train_x[idx[b * batch_size: (b + 1) * batch_size]]
            x = augmentation(x, batch_size)
            x = resize_dataset(x,IMG_SIZE)
            y = train_y[idx[b * batch_size: (b + 1) * batch_size]]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out = model(tx)
            loss = autograd.softmax_cross_entropy(out, ty)               
            train_correct += accuracy(tensor.to_numpy(out), to_categorical(y, num_classes)).astype(np.float32)
            train_loss += tensor.to_numpy(loss)[0]
            if not partial_update:
                sgd.backward_and_update(loss)
            else:
                sgd.backward_and_partial_update(loss)

        if DIST:
            # Reduce the Evaluation Accuracy and Loss from Multiple Devices
            reducer = tensor.Tensor((1,), dev, tensor.float32)
            train_correct = reduce_variable(train_correct, sgd, reducer)
            train_loss = reduce_variable(train_loss, sgd, reducer)

        # Output the Training Loss and Accuracy
        if ((DIST == False) or (sgd.rank_in_global == 0)):
            print('Training loss = %f, training accuracy = %f' % (train_loss, train_correct / (num_train_batch*batch_size*world_size)), flush=True)

        if partial_update:
            # synchronize parameters before the evaluation phase
            for p in param:
                sychronize(p, sgd)

        # Evaluation Phase
        autograd.training = False
        for b in range(num_test_batch):
            x = test_x[b * batch_size: (b + 1) * batch_size]
            x = resize_dataset(x,IMG_SIZE)
            y = test_y[b * batch_size: (b + 1) * batch_size]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out_test = model(tx)
            test_correct += accuracy(tensor.to_numpy(out_test), to_categorical(y, num_classes))

        if DIST:
            # Reduce the Evaluation Accuracy from Multiple Devices
            test_correct = reduce_variable(test_correct, sgd, reducer)

        # Output the Evaluation Accuracy
        if ((DIST == False) or (sgd.rank_in_global == 0)):
            print('Evaluation accuracy = %f, Elapsed Time = %fs' % (test_correct / (num_test_batch*batch_size*world_size), time.time() - start_time ), flush=True)
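
`reduce_variable` and `sychronize` are helpers from the original distributed-training script and are not shown here. A sketch that uses only DistOpt calls already appearing in this document (`all_reduce`, `wait`, `world_size`):

def sychronize(tensor_param, dist_opt):
    # All-reduce a parameter across ranks and average it.
    dist_opt.all_reduce(tensor_param.data)
    dist_opt.wait()
    tensor_param /= dist_opt.world_size

def reduce_variable(variable, dist_opt, reducer):
    # Sum a (1,)-shaped numpy statistic (loss/accuracy) across ranks via a device tensor.
    reducer.copy_from_numpy(variable)
    dist_opt.all_reduce(reducer.data)
    dist_opt.wait()
    return tensor.to_numpy(reducer)
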
Example #9
label = to_categorical(label, 2).astype(np.float32)
print('train_data_shape:', data.shape)
print('train_label_shape:', label.shape)

inputs = Tensor(data=data)
target = Tensor(data=label)

linear1 = autograd.Linear(3, 2)
linear2 = autograd.Linear(2, 2)
linear3 = autograd.Linear(2, 2)

sgd = optimizer.SGD(0.00)

# training process
for i in range(1):
    x = linear1(inputs)
    x = autograd.relu(x)
    x1 = linear2(x)
    x2 = linear3(x)
    x3 = autograd.add(x1, x2)
    y = autograd.softmax(x3)
    loss = autograd.cross_entropy(y, target)
    gradient = autograd.backward(loss)
    for p, gp in gradient:
        sgd.apply(0, gp, p, '')
    if (i % 100 == 0):
        print('training loss = ', tensor.to_numpy(loss)[0])

model = sonnx.to_onnx_model([inputs], [y])

onnx.save(model, 'linear.onnx')
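
`to_categorical` (also used in Examples #8, #13 and #14) one-hot encodes integer class labels; a minimal sketch:

def to_categorical(y, num_classes):
    # Convert integer class labels of shape (N,) into one-hot vectors of shape (N, num_classes).
    y = np.asarray(y, dtype="int32")
    one_hot = np.zeros((y.shape[0], num_classes), dtype=np.float32)
    one_hot[np.arange(y.shape[0]), y] = 1.0
    return one_hot
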
Example #10
    def backward_and_spars_update(self,
                                  loss,
                                  threshold=2097152,
                                  spars=0.05,
                                  topK=False,
                                  corr=True):
        """ Performs backward propagation from the loss and parameter update with sparsification.

        THIS IS A EXPERIMENTAL FUNCTION FOR RESEARCH PURPOSE:
        From the loss, it performs backward propagation to get the gradients and do the parameter 
        update. It fuses the tensors with size smaller than the threshold value to reduce network
        latency, as well as using sparsification schemes to transfer only the gradient elements which
        are significant.

        Args:
                loss(Tensor): loss is the objective function of the deep learning model 
                optimization, e.g. for classification problem it can be the output of the
                softmax_cross_entropy function.
                threshold(int): threshold is a parameter to control performance in fusing
                the tensors. For the tensors of sizes smaller than threshold, they are to
                be accumulated and fused before the all reduce operation. For the tensors 
                of its size larger than the threshold value, they are to be reduced directly
                without fusion.
                spars(float): a parameter to control sparsity as defined below 
                topK(bool): When topK is False, it sparsifies the gradient with absolute 
                value >= sparsWhen topK is True, it sparsifies a fraction of total gradient
                number equals to spars,  E.g. when spars = 0.01, it sparsifies 1 % of the
                total gradient elements
                corr(bool): whether to use the local accumulate gradient for correction

        Attributes:
                self.sparsInit: A counter to determine which partition to perform all-reduce.
                self.gradAccumulation: Local gradient accumulation
        """
        if ((not hasattr(self, "sparsInit")) and corr):
            self.gradAccumulation = []
            self.sparsInit = False
        plist = []
        acc = 0
        k = -1
        glist = []
        for p, g in autograd.backward(loss):
            if g.size() > threshold:
                # larger than threshold -> reduced directly
                k += 1
                if (corr and (not self.sparsInit)):
                    # create a tensor for the gradient accumulation
                    self.gradAccumulation.append(
                        tensor.Tensor((g.size(), ), p.device, p.dtype))
                    self.gradAccumulation[k].set_value(0.0)
                if corr:
                    self.sparsification(g.data, self.gradAccumulation[k].data,
                                        spars, topK)
                else:
                    self.sparsification(g.data, None, spars, topK)
            else:
                # smaller than threshold -> accumulate
                glist.append(g.data)
                acc += g.size()
                if (acc > threshold):
                    k += 1
                    if (corr and (not self.sparsInit)):
                        # create a tensor for the gradient accumulation
                        self.gradAccumulation.append(
                            tensor.Tensor((acc, ), p.device, p.dtype))
                        self.gradAccumulation[k].set_value(0.0)
                    if corr:
                        self.fused_sparsification(
                            glist, self.gradAccumulation[k].data, spars, topK)
                    else:
                        self.fused_sparsification(glist, None, spars, topK)
                    acc = 0
                    glist = []
            plist.append((p, g))
        if glist:
            k += 1
            if (corr and (not self.sparsInit)):
                # create a tensor for the gradient accumulation
                self.gradAccumulation.append(
                    tensor.Tensor((acc, ), p.device, p.dtype))
                self.gradAccumulation[k].set_value(0.0)
            if corr:
                self.fused_sparsification(glist, self.gradAccumulation[k].data,
                                          spars, topK)
            else:
                self.fused_sparsification(glist, None, spars, topK)
        self.wait()
        for p, g in plist:
            self.update(p, g)
        self.sparsInit = True
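
A hypothetical call site for this method, assuming a DistOpt-wrapped SGD optimizer as in Example #13; `model`, `tx`, `ty`, `nccl_id`, `local_rank` and `world_size` are placeholders, not names from the snippet above:

sgd = opt.DistOpt(opt.SGD(lr=0.005, momentum=0.9),
                  nccl_id=nccl_id, local_rank=local_rank, world_size=world_size)
out = model(tx)
loss = autograd.softmax_cross_entropy(out, ty)
# Keep the top 1% of gradient elements and apply local gradient correction.
sgd.backward_and_spars_update(loss, spars=0.01, topK=True, corr=True)
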
Example #11
    def backward_and_update(self, loss):
        for p, g in autograd.backward(loss):
            self.update(p, g)
Example #12
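    # (conv1, conv2, linear, sgd, x_train, y_train, batch_number and epochs are
    # defined earlier in the original script; only this fragment is shown.)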
    def forward(x, t):
        y = conv1(x)
        y = autograd.relu(y)
        y = conv2(y)
        y = autograd.relu(y)
        y = autograd.max_pool_2d(y)
        y = autograd.flatten(y)
        y = linear(y)
        y = autograd.soft_max(y)
        loss = autograd.cross_entropy(y, t)
        return loss, y

    autograd.training = True
    for epoch in range(epochs):
        for i in range(batch_number):
            inputs = tensor.Tensor(data=x_train[i * 100:(1 + i) * 100, :])
            targets = tensor.Tensor(data=y_train[i * 100:(1 + i) * 100, :])

            loss, y = forward(inputs, targets)

            accuracy_rate = accuracy(autograd.ctensor2numpy(
                y.data), autograd.ctensor2numpy(targets.data))
            if (i % 5 == 0):
                print('accuracy is:', accuracy_rate, 'loss is:',
                      autograd.ctensor2numpy(loss.data)[0])

            in_grads = autograd.backward(loss)

            for param in in_grads:
                sgd.apply(0, in_grads[param], param, '')
Example #13
def train_mnist_cnn(DIST=False,
                    local_rank=None,
                    world_size=None,
                    nccl_id=None,
                    spars=0,
                    topK=False,
                    corr=True):

    # Define the hyperparameters suitable for the MNIST CNN
    max_epoch = 10
    batch_size = 64
    sgd = opt.SGD(lr=0.005, momentum=0.9, weight_decay=1e-5)

    # Prepare the training and validation data
    train_x, train_y, test_x, test_y = load_dataset()
    IMG_SIZE = 28
    num_classes = 10
    train_y = to_categorical(train_y, num_classes)
    test_y = to_categorical(test_y, num_classes)

    # Normalization
    train_x = train_x / 255
    test_x = test_x / 255

    if DIST:
        # For Distributed GPU Training
        sgd = opt.DistOpt(sgd,
                          nccl_id=nccl_id,
                          local_rank=local_rank,
                          world_size=world_size)
        dev = device.get_default_device(sgd.local_rank)
        # Dataset partition for distributed training
        train_x, train_y = data_partition(train_x, train_y, sgd.global_rank,
                                          sgd.world_size)
        test_x, test_y = data_partition(test_x, test_y, sgd.global_rank,
                                        sgd.world_size)
        world_size = sgd.world_size
    else:
        # For Single GPU
        dev = device.get_default_device()
        world_size = 1

    # create model
    model = CNN()

    tx = tensor.Tensor((batch_size, 1, IMG_SIZE, IMG_SIZE), dev,
                       tensor.float32)
    ty = tensor.Tensor((batch_size, num_classes), dev, tensor.int32)
    num_train_batch = train_x.shape[0] // batch_size
    num_test_batch = test_x.shape[0] // batch_size
    idx = np.arange(train_x.shape[0], dtype=np.int32)

    if DIST:
        # Synchronize the initial parameters
        autograd.training = True
        x = np.random.randn(batch_size, 1, IMG_SIZE,
                            IMG_SIZE).astype(np.float32)
        y = np.zeros(shape=(batch_size, num_classes), dtype=np.int32)
        tx.copy_from_numpy(x)
        ty.copy_from_numpy(y)
        out = model.forward(tx)
        loss = autograd.softmax_cross_entropy(out, ty)
        for p, g in autograd.backward(loss):
            print('tensor.data.type is %s' % type(p.data).__name__)
            synchronize(p, sgd)

    # Training and Evaluation Loop
    for epoch in range(max_epoch):
        start_time = time.time()
        np.random.shuffle(idx)

        if ((DIST == False) or (sgd.global_rank == 0)):
            print('Starting Epoch %d:' % (epoch))

        # Training Phase
        autograd.training = True
        train_correct = np.zeros(shape=[1], dtype=np.float32)
        test_correct = np.zeros(shape=[1], dtype=np.float32)
        train_loss = np.zeros(shape=[1], dtype=np.float32)
        time_start = time.time()
        for b in range(num_train_batch):
            x = train_x[idx[b * batch_size:(b + 1) * batch_size]]
            x = augmentation(x, batch_size)
            y = train_y[idx[b * batch_size:(b + 1) * batch_size]]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out = model.forward(tx)
            loss = autograd.softmax_cross_entropy(out, ty)
            train_correct += accuracy(tensor.to_numpy(out), y)
            train_loss += tensor.to_numpy(loss)[0]
            if DIST:
                if (spars == 0):
                    sgd.backward_and_update(loss, threshold=50000)
                else:
                    sgd.backward_and_sparse_update(loss,
                                                   spars=spars,
                                                   topK=topK,
                                                   corr=corr)
            else:
                sgd.backward_and_update(loss)

            # Evaluation Phase
            if b % 20 != 0:
                continue
            autograd.training = False
            num_test_batch_inside = 20
            test_correct = 0
            for b in range(num_test_batch_inside):
                x = test_x[b * batch_size:(b + 1) * batch_size]
                y = test_y[b * batch_size:(b + 1) * batch_size]
                tx.copy_from_numpy(x)
                ty.copy_from_numpy(y)
                out_test = model.forward(tx)
                test_correct += accuracy(tensor.to_numpy(out_test), y)
            print('Evaluation accuracy = %f' %
                  (test_correct / (batch_size * num_test_batch_inside)),
                  flush=True)
            autograd.training = True

        print('epoch time is %f' % (time.time() - time_start))
        if DIST:
            # Reduce the Evaluation Accuracy and Loss from Multiple Devices
            reducer = tensor.Tensor((1, ), dev, tensor.float32)
            train_correct = reduce_variable(train_correct, sgd, reducer)
            train_loss = reduce_variable(train_loss, sgd, reducer)

        # Output the Training Loss and Accuracy
        if ((DIST == False) or (sgd.global_rank == 0)):
            print('Training loss = %f, training accuracy = %f' %
                  (train_loss, train_correct /
                   (num_train_batch * batch_size * world_size)),
                  flush=True)

        # Evaluation Phase
        autograd.training = False
        for b in range(num_test_batch):
            x = test_x[b * batch_size:(b + 1) * batch_size]
            y = test_y[b * batch_size:(b + 1) * batch_size]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out_test = model.forward(tx)
            test_correct += accuracy(tensor.to_numpy(out_test), y)

        if DIST:
            # Reduce the Evaluation Accuracy from Multiple Devices
            test_correct = reduce_variable(test_correct, sgd, reducer)

        # Output the Evaluation Accuracy
        if ((DIST == False) or (sgd.global_rank == 0)):
            print('Evaluation accuracy = %f, Elapsed Time = %fs' %
                  (test_correct / (num_test_batch * batch_size * world_size),
                   time.time() - start_time),
                  flush=True)
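
`data_partition` splits the dataset across ranks for distributed training and is not included in the snippet. A sketch consistent with how it is called here and in Examples #8 and #14:

def data_partition(dataset_x, dataset_y, rank, world_size):
    # Give each rank an equal, contiguous slice of the dataset.
    per_rank = dataset_x.shape[0] // world_size
    start, end = rank * per_rank, (rank + 1) * per_rank
    return dataset_x[start:end], dataset_y[start:end]
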
Example #14
def train_mnist_cnn(sgd,
                    max_epoch,
                    batch_size,
                    DIST=False,
                    data_partition=None,
                    gpu_num=None,
                    gpu_per_node=None,
                    nccl_id=None):

    # Prepare the training and validation data
    train_x, train_y, test_x, test_y = load_dataset()
    IMG_SIZE = 28
    num_classes = 10
    train_y = to_categorical(train_y, num_classes)
    test_y = to_categorical(test_y, num_classes)

    # Normalization
    train_x = train_x / 255
    test_x = test_x / 255

    if DIST:
        # For Distributed GPU Training
        sgd = opt.DistOpt(sgd,
                          nccl_id=nccl_id,
                          gpu_num=gpu_num,
                          gpu_per_node=gpu_per_node)
        dev = device.create_cuda_gpu_on(sgd.rank_in_local)
        # Dataset partition for distributed training
        train_x, train_y = data_partition(train_x, train_y, sgd.rank_in_global,
                                          sgd.world_size)
        test_x, test_y = data_partition(test_x, test_y, sgd.rank_in_global,
                                        sgd.world_size)
        world_size = sgd.world_size
    else:
        # For Single GPU
        dev = device.create_cuda_gpu()
        world_size = 1

    # create model
    model = CNN()

    tx = tensor.Tensor((batch_size, 1, IMG_SIZE, IMG_SIZE), dev,
                       tensor.float32)
    ty = tensor.Tensor((batch_size, num_classes), dev, tensor.int32)
    num_train_batch = train_x.shape[0] // batch_size
    num_test_batch = test_x.shape[0] // batch_size
    idx = np.arange(train_x.shape[0], dtype=np.int32)

    if DIST:
        # Synchronize the initial parameters
        autograd.training = True
        x = np.random.randn(batch_size, 1, IMG_SIZE,
                            IMG_SIZE).astype(np.float32)
        y = np.zeros(shape=(batch_size, num_classes), dtype=np.int32)
        tx.copy_from_numpy(x)
        ty.copy_from_numpy(y)
        out = model.forward(tx)
        loss = autograd.softmax_cross_entropy(out, ty)
        for p, g in autograd.backward(loss):
            sychronize(p, sgd)

    # Training and Evaluation Loop
    for epoch in range(max_epoch):
        start_time = time.time()
        np.random.shuffle(idx)

        if ((DIST == False) or (sgd.rank_in_global == 0)):
            print('Starting Epoch %d:' % (epoch))

        # Training Phase
        autograd.training = True
        train_correct = np.zeros(shape=[1], dtype=np.float32)
        test_correct = np.zeros(shape=[1], dtype=np.float32)
        train_loss = np.zeros(shape=[1], dtype=np.float32)

        for b in range(num_train_batch):
            x = train_x[idx[b * batch_size:(b + 1) * batch_size]]
            x = augmentation(x, batch_size)
            y = train_y[idx[b * batch_size:(b + 1) * batch_size]]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out = model.forward(tx)
            loss = autograd.softmax_cross_entropy(out, ty)
            train_correct += accuracy(tensor.to_numpy(out), y)
            train_loss += tensor.to_numpy(loss)[0]
            for p, g in autograd.backward(loss):
                sgd.update(p, g)

        if DIST:
            # Reduce the Evaluation Accuracy and Loss from Multiple Devices
            reducer = tensor.Tensor((1, ), dev, tensor.float32)
            train_correct = reduce_variable(train_correct, sgd, reducer)
            train_loss = reduce_variable(train_loss, sgd, reducer)

        # Output the Training Loss and Accuracy
        if ((DIST == False) or (sgd.rank_in_global == 0)):
            print('Training loss = %f, training accuracy = %f' %
                  (train_loss, train_correct /
                   (num_train_batch * batch_size * world_size)),
                  flush=True)

        # Evaluation Phase
        autograd.training = False
        for b in range(num_test_batch):
            x = test_x[b * batch_size:(b + 1) * batch_size]
            y = test_y[b * batch_size:(b + 1) * batch_size]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out_test = model.forward(tx)
            test_correct += accuracy(tensor.to_numpy(out_test), y)

        if DIST:
            # Reduce the Evaluation Accuracy from Multiple Devices
            test_correct = reduce_variable(test_correct, sgd, reducer)

        # Output the Evaluation Accuracy
        if ((DIST == False) or (sgd.rank_in_global == 0)):
            print('Evaluation accuracy = %f, Elapsed Time = %fs' %
                  (test_correct / (num_test_batch * batch_size * world_size),
                   time.time() - start_time),
                  flush=True)
Example #15
    def backward_and_partial_update(self, loss, threshold=2097152):
        """Performs backward propagation from the loss and parameter update using asychronous training.

        THIS IS A EXPERIMENTAL FUNCTION FOR RESEARCH PURPOSE:
        From the loss, it performs backward propagation to get the gradients and do the parameter 
        update. It fuses the tensors smaller than the threshold value to reduce network latency, 
        as well as performing asychronous training where one parameter partition is all-reduced
        per iteration. The size of the parameter partition depends on the threshold value.

        Args:
                loss(Tensor): loss is the objective function of the deep learning model 
                optimization, e.g. for classification problem it can be the output of the
                softmax_cross_entropy function.
                threshold(int): threshold is a parameter to control performance in fusing
                the tensors. For the tensors of sizes smaller than threshold, they are to
                be accumulated and fused before the all reduce operation. For the tensors 
                of its size larger than the threshold value, they are to be reduced directly
                without fusion.

        Attributes:
                self.partial(int): A counter to determine which partition to perform all-reduce.
                This counter resets to zero automatlly after an update cycle of the full parameter
                set.
        """
        if not hasattr(self, "partial"):
            self.partial = 0
        self.partial += 1
        k = 0
        plist = []
        acc = 0
        tenlist = []
        reduced = []
        for p, g in autograd.backward(loss):
            # every parameters update locally
            self.opt.update(p, g)
            # then do the partial parameter sychronization
            if p.size() > threshold:
                # larger than threshold -> reduced directly
                # k is the partition number of the full gradient set
                k += 1
                if (k == self.partial):
                    self.all_reduce(p.data)
                    reduced.append(p)
            else:
                # smaller than threshold -> accumulate
                plist.append(p.data)
                tenlist.append(p)
                acc += p.size()
                if (acc > threshold):
                    k += 1
                    if (k == self.partial):
                        self.fused_all_reduce(plist)
                        reduced = tenlist
                    acc = 0
                    plist = []
                    tenlist = []
        if plist:
            k += 1
            if (k == self.partial):
                self.fused_all_reduce(plist)
                reduced = tenlist
        self.wait()
        # the all-reduced parameters needed to be averaged
        for r in reduced:
            r /= self.world_size
        # the counter returns to zero after a cycle of partial update
        if (k == self.partial):
            self.partial = 0
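
A hypothetical call site, mirroring the `partial_update` branch of Example #8; `model`, `tx` and `ty` are placeholders. Each call updates all parameters locally but all-reduces only one partition of roughly `threshold` elements:

out = model(tx)
loss = autograd.softmax_cross_entropy(out, ty)
sgd.backward_and_partial_update(loss, threshold=2097152)
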
Example #16
    print('train_label_shape:', label.shape)

    inputs = Tensor(data=data)
    target = Tensor(data=label)

    w0 = Tensor(shape=(2, 3), requires_grad=True, stores_grad=True)
    w0.gaussian(0.0, 0.1)
    b0 = Tensor(shape=(1, 3), requires_grad=True, stores_grad=True)
    b0.set_value(0.0)

    w1 = Tensor(shape=(3, 2), requires_grad=True, stores_grad=True)
    w1.gaussian(0.0, 0.1)
    b1 = Tensor(shape=(1, 2), requires_grad=True, stores_grad=True)
    b1.set_value(0.0)

    sgd = optimizer.SGD(0.05)
    # training process
    for i in range(1001):
        x = autograd.matmul(inputs, w0)
        x = autograd.add_bias(x, b0)
        x = autograd.relu(x)
        x = autograd.matmul(x, w1)
        x = autograd.add_bias(x, b1)
        x = autograd.softmax(x)
        loss = autograd.cross_entropy(x, target)
        for p, gp in autograd.backward(loss):
            sgd.apply(0, gp, p, '')

        if (i % 100 == 0):
            print('training loss = ', tensor.to_numpy(loss)[0])
Example #17
    def call(self, loss):
        for p, g in autograd.backward(loss):
            if p.name is None:
                p.name = id(p)
            self.apply(p.name, p, g)