Example #1
def benchmark(trainer):
    # Benchmark to measure the backward time of each layer
    p = Profiling(trainer.net)
    # Warmup
    input_shape, output_shape = trainer.get_data_shape()
    warmup = 5 # warmup should be 0 on some GPUs (e.g., P102-100)
    iteration = 50

    for i in range(iteration+warmup):
        data = trainer.data_iter()

        if trainer.dataset == 'an4':
            inputs, labels_cpu, input_percentages, target_sizes = data
            input_sizes = input_percentages.mul_(int(inputs.size(3))).int()
        else:
            inputs, labels_cpu = data
        if trainer.is_cuda:
            if trainer.dnn == 'lstm' :
                inputs = Variable(inputs.transpose(0, 1).contiguous()).cuda()
                labels = Variable(labels_cpu.transpose(0, 1).contiguous()).cuda()
            else:
                inputs, labels = inputs.cuda(non_blocking=True), labels_cpu.cuda(non_blocking=True)
        else:
            labels = labels_cpu

        if trainer.dnn == 'lstman4':
            out, output_sizes = trainer.net(inputs, input_sizes)
            out = out.transpose(0, 1)  # TxNxH
            loss = trainer.criterion(out, labels_cpu, output_sizes, target_sizes)
            torch.cuda.synchronize()
            loss = loss / inputs.size(0)  # average the loss by minibatch
        elif trainer.dnn == 'lstm' :
            hidden = trainer.net.init_hidden()
            hidden = lstmpy.repackage_hidden(hidden)
            #print(inputs.size(), hidden[0].size(), hidden[1].size())
            outputs, hidden = trainer.net(inputs, hidden)
            tt = torch.squeeze(labels.view(-1, trainer.net.batch_size * trainer.net.num_steps))
            loss = trainer.criterion(outputs.view(-1, trainer.net.vocab_size), tt)
            torch.cuda.synchronize()
        else:
            # forward + backward + optimize
            outputs = trainer.net(inputs)
            loss = trainer.criterion(outputs, labels)
            torch.cuda.synchronize()

        if i >= warmup:
            p.start()
        loss.backward()
        if trainer.is_cuda:
            torch.cuda.synchronize()
    layerwise_times, sum_total = p.get_layerwise_times()
    seq_keys = p.get_backward_seq_keys()
    p.stop()
    return seq_keys[::-1], layerwise_times[::-1], p.get_backward_key_sizes()[::-1]
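The Profiling class used above is not included in this snippet; it evidently hooks the backward pass and reports per-layer times in execution order. A minimal sketch of that idea, assuming leaf-module backward hooks and wall-clock timing (all names here are illustrative, not the actual implementation):

import time
import torch

class LayerwiseBackwardTimer:
    """Rough per-layer backward timings via module backward hooks."""
    def __init__(self, model):
        self.records = {}      # layer name -> list of measured durations (seconds)
        self.enabled = False
        self._last = None
        for name, module in model.named_modules():
            if len(list(module.children())) == 0:   # attach to leaf modules only
                module.register_backward_hook(self._make_hook(name))

    def _make_hook(self, name):
        def hook(module, grad_input, grad_output):
            if not self.enabled:
                return
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            now = time.time()
            # Time elapsed since the previous hook approximates this layer's backward cost.
            self.records.setdefault(name, []).append(now - self._last)
            self._last = now
        return hook

    def start(self):
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        self._last = time.time()
        self.enabled = True

    def stop(self):
        self.enabled = False

    def layerwise_times(self):
        # Average duration per layer, keyed in the order layers first fired during backward.
        return {name: sum(ts) / len(ts) for name, ts in self.records.items()}
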
Example #2
    def test(self, epoch):
        self.net.eval()
        test_loss = 0
        correct = 0
        top1_acc = []
        top5_acc = []
        total = 0
        total_steps = 0
        costs = 0.0
        total_iters = 0
        total_wer = 0
        for batch_idx, data in enumerate(self.testloader):

            if self.dataset == 'an4':
                inputs, labels_cpu, input_percentages, target_sizes = data
                input_sizes = input_percentages.mul_(int(inputs.size(3))).int()
            else:
                inputs, labels_cpu = data
            if self.is_cuda:
                if self.dnn == 'lstm' :
                    inputs = Variable(inputs.transpose(0, 1).contiguous()).cuda()
                    labels = Variable(labels_cpu.transpose(0, 1).contiguous()).cuda()
                else:
                    inputs, labels = inputs.cuda(non_blocking=True), labels_cpu.cuda(non_blocking=True)
            else:
                labels = labels_cpu

            if self.dnn == 'lstm' :
                hidden = self.net.init_hidden()
                hidden = lstmpy.repackage_hidden(hidden)
                outputs, hidden = self.net(inputs, hidden)
                tt = torch.squeeze(labels.view(-1, self.net.batch_size * self.net.num_steps))
                loss = self.criterion(outputs.view(-1, self.net.vocab_size), tt)
                test_loss += loss.item()
                costs += loss.item() * self.net.num_steps
                total_steps += self.net.num_steps
            elif self.dnn == 'lstman4':
                targets = labels_cpu
                split_targets = []
                offset = 0
                for size in target_sizes:
                    split_targets.append(targets[offset:offset + size])
                    offset += size

                out, output_sizes = self.net(inputs, input_sizes)
                decoded_output, _ = self.decoder.decode(out.data, output_sizes)

                target_strings = self.decoder.convert_to_strings(split_targets)

                wer, cer = 0, 0
                for x in range(len(target_strings)):
                    transcript, reference = decoded_output[x][0], target_strings[x][0]
                    wer += self.decoder.wer(transcript, reference) / float(len(reference.split()))
                total_wer += wer

            else:
                outputs = self.net(inputs)
                loss = self.criterion(outputs, labels)

                acc1, acc5 = self.cal_accuracy(outputs, labels, topk=(1, 5))
                top1_acc.append(float(acc1))
                top5_acc.append(float(acc5))

                test_loss += loss.data.item()
            total += labels.size(0)
            total_iters += 1
        test_loss /= total_iters
        if self.dnn not in ['lstm', 'lstman4']:
            acc = np.mean(top1_acc)
            acc5 = np.mean(top5_acc)
        elif self.dnn == 'lstm':
            acc = np.exp(costs / total_steps)
            acc5 = 0.0
        elif self.dnn == 'lstman4':
            wer = total_wer / len(self.testloader.dataset)
            acc = wer
            acc5 = 0.0
        loss = float(test_loss)/total
        logger.info('Epoch %d, lr: %f, val loss: %f, val top-1 acc: %f, top-5 acc: %f' % (epoch, self.lr, test_loss, acc, acc5))
        self.net.train()
        return acc
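cal_accuracy is called with topk=(1, 5) above, but its implementation is not part of the snippet. A plausible standalone sketch following the standard top-k precision pattern (an assumption, not the repository's actual helper; in the trainer it is a method):

import torch

def cal_accuracy(output, target, topk=(1,)):
    # Precision@k in percent, following the common PyTorch ImageNet-example pattern.
    maxk = max(topk)
    batch_size = target.size(0)
    _, pred = output.topk(maxk, dim=1, largest=True, sorted=True)
    pred = pred.t()                                        # shape: (maxk, batch)
    correct = pred.eq(target.view(1, -1).expand_as(pred))  # boolean hit matrix
    res = []
    for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res
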
Example #3
    def train(self, num_of_iters=1, data=None, hidden=None):
        self.loss = 0.0
        s = time.time()
        # zero the parameter gradients
        #self.optimizer.zero_grad()
        for i in range(num_of_iters):
            self.adjust_learning_rate(self.train_epoch, self.optimizer)
            if self.train_iter % self.num_batches_per_epoch == 0 and self.train_iter > 0:
                self.train_epoch += 1
                logger.info('train iter: %d, num_batches_per_epoch: %d', self.train_iter, self.num_batches_per_epoch)
                logger.info('Epoch %d, avg train acc: %f, lr: %f, avg loss: %f' % (self.train_iter//self.num_batches_per_epoch, np.mean(self.train_acc_top1), self.lr, self.avg_loss_per_epoch/self.num_batches_per_epoch))

                if self.rank == 0 and self.writer is not None:
                    self.writer.add_scalar('cross_entropy', self.avg_loss_per_epoch/self.num_batches_per_epoch, self.train_epoch)
                    self.writer.add_scalar('top-1_acc', np.mean(self.train_acc_top1), self.train_epoch)
                if self.rank == 0:
                    self.test(self.train_epoch)
                self.sparsities = []
                self.compression_ratios = []
                self.communication_sizes = []
                self.train_acc_top1 = []
                self.epochs_info.append(self.avg_loss_per_epoch/self.num_batches_per_epoch)
                self.avg_loss_per_epoch = 0.0

                # Save checkpoint
                if self.train_iter > 0 and self.rank == 0:
                    state = {'iter': self.train_iter, 'epoch': self.train_epoch, 'state': self.get_model_state()}
                    if self.prefix:
                        relative_path = './weights/%s/%s-n%d-bs%d-lr%.4f' % (self.prefix, self.dnn, self.nworkers, self.batch_size, self.base_lr)
                    else:
                        relative_path = './weights/%s-n%d-bs%d-lr%.4f' % (self.dnn, self.nworkers, self.batch_size, self.base_lr)
                    utils.create_path(relative_path)
                    filename = '%s-rank%d-epoch%d.pth'%(self.dnn, self.rank, self.train_epoch)
                    fn = os.path.join(relative_path, filename)
                    if self.train_epoch % 2 == 0:
                        self.save_checkpoint(state, fn)
                        self.remove_dict(state)
                if self.train_sampler and (self.nworkers > 1):
                    self.train_sampler.set_epoch(self.train_epoch)

            ss = time.time()
            if data is None:
                data = self.data_iter()

            if self.dataset == 'an4':
                inputs, labels_cpu, input_percentages, target_sizes = data
                input_sizes = input_percentages.mul_(int(inputs.size(3))).int()
            else:
                inputs, labels_cpu = data
            if self.is_cuda:
                if self.dnn == 'lstm' :
                    inputs = Variable(inputs.transpose(0, 1).contiguous()).cuda()
                    labels = Variable(labels_cpu.transpose(0, 1).contiguous()).cuda()
                else:
                    inputs, labels = inputs.cuda(non_blocking=True), labels_cpu.cuda(non_blocking=True)
            else:
                labels = labels_cpu
                
            self.iotime += (time.time() - ss)
            
            sforward = time.time()
            if self.dnn == 'lstman4':
                out, output_sizes = self.net(inputs, input_sizes)
                out = out.transpose(0, 1)  # TxNxH
                loss = self.criterion(out, labels_cpu, output_sizes, target_sizes)
                #torch.cuda.synchronize()
                self.forwardtime += (time.time() - sforward)
                loss = loss / inputs.size(0)  # average the loss by minibatch
            elif self.dnn == 'lstm' :
                hidden = lstmpy.repackage_hidden(hidden)
                outputs, hidden = self.net(inputs, hidden)
                tt = torch.squeeze(labels.view(-1, self.net.batch_size * self.net.num_steps))
                loss = self.criterion(outputs.view(-1, self.net.vocab_size), tt)
                #torch.cuda.synchronize()
                self.forwardtime += (time.time() - sforward)
            else:
                # forward + backward + optimize
                outputs = self.net(inputs)
                loss = self.criterion(outputs, labels)
                #torch.cuda.synchronize()
                self.forwardtime += (time.time() - sforward)
            sbackward = time.time()
            if self.amp_handle is not None:
                with apex.amp.scale_loss(loss, self.optimizer) as scaled_loss:
                    scaled_loss.backward()
                    loss = scaled_loss
            else:
                loss.backward()
            loss_value = loss.item()
            #torch.cuda.synchronize()
            self.backwardtime += (time.time() - sbackward)

            self.loss += loss_value 

            self.avg_loss_per_epoch += loss_value

            if self.dnn not in ['lstm', 'lstman4']:
                acc1, = self.cal_accuracy(outputs, labels, topk=(1,))
                self.train_acc_top1.append(float(acc1))
                
            self.train_iter += 1
        self.num_of_updates_during_comm += 1
        self.loss /= num_of_iters 
        self.timer += time.time() - s 
        display = 40
        if self.train_iter % display == 0:
            logger.warn('[%3d][%5d/%5d][rank:%d] loss: %.3f, average forward (%f) and backward (%f) time: %f, iotime: %f ' %
                  (self.train_epoch, self.train_iter, self.num_batches_per_epoch, self.rank,  self.loss, self.forwardtime/display, self.backwardtime/display, self.timer/display, self.iotime/display))
            self.timer = 0.0
            self.iotime = 0.0
            self.forwardtime = 0.0
            self.backwardtime = 0.0
            
        if self.dnn == 'lstm':
            return num_of_iters, hidden
        return num_of_iters
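adjust_learning_rate(epoch, optimizer) is invoked every iteration but not defined in this snippet. A minimal standalone sketch of a step-decay variant; the decay factor and step size are illustrative assumptions, not the trainer's actual schedule:

def adjust_learning_rate(optimizer, base_lr, epoch):
    # Illustrative step decay: divide the base learning rate by 10 every 30 epochs.
    lr = base_lr * (0.1 ** (epoch // 30))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return lr
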
Example #4
    def train(self, num_of_iters=1, data=None, hidden=None):
        self.loss = 0.0
        s = time.time()

        for i in range(num_of_iters):
            self.adjust_learning_rate(self.train_epoch, self.optimizer)
            if self.train_iter % self.num_batches_per_epoch == 0 and self.train_iter > 0:
                logger.info('train iter: %d, num_batches_per_epoch: %d',
                            self.train_iter, self.num_batches_per_epoch)
                logger.info(
                    'Epoch %d, avg train acc: %f, lr: %f, avg loss: %f' %
                    (self.train_iter // self.num_batches_per_epoch,
                     np.mean(self.train_acc_top1), self.lr,
                     self.avg_loss_per_epoch / self.num_batches_per_epoch))
                mean_s = np.mean(self.sparsities)
                if self.train_iter > 0 and np.isnan(mean_s):
                    logger.warn('NaN detected! sparsities:  %s' %
                                self.sparsities)
                logger.info(
                    'Average Sparsity: %f, compression ratio: %f, communication size: %f',
                    np.mean(self.sparsities), np.mean(self.compression_ratios),
                    np.mean(self.communication_sizes))
                if self.rank == 0 and self.writer is not None:
                    self.writer.add_scalar(
                        'cross_entropy',
                        self.avg_loss_per_epoch / self.num_batches_per_epoch,
                        self.train_epoch)
                    self.writer.add_scalar('top-1 acc',
                                           np.mean(self.train_acc_top1),
                                           self.train_epoch)
                if self.rank == 0:
                    self.test(self.train_epoch)
                self.sparsities = []
                self.compression_ratios = []
                self.communication_sizes = []
                self.train_acc_top1 = []
                self.epochs_info.append(self.avg_loss_per_epoch /
                                        self.num_batches_per_epoch)
                self.avg_loss_per_epoch = 0.0
                if self.train_iter > 0 and self.rank == 0:
                    state = {
                        'iter': self.train_iter,
                        'epoch': self.train_epoch,
                        'state': self.get_model_state()
                    }
                    if self.prefix:
                        relative_path = './weights/%s/%s-n%d-bs%d-lr%.4f' % (
                            self.prefix, self.dnn, self.nworkers,
                            self.batch_size, self.base_lr)
                    else:
                        relative_path = './weights/%s-n%d-bs%d-lr%.4f' % (
                            self.dnn, self.nworkers, self.batch_size,
                            self.base_lr)
                    if settings.SPARSE:
                        relative_path += '-s%.5f' % self.sparsity
                    utils.create_path(relative_path)
                    filename = '%s-rank%d-epoch%d.pth' % (self.dnn, self.rank,
                                                          self.train_epoch)
                    fn = os.path.join(relative_path, filename)
                    #self.save_checkpoint(state, fn)
                    #self.remove_dict(state)
                self.train_epoch += 1
                if self.train_sampler and (self.nworkers > 1):
                    self.train_sampler.set_epoch(self.train_epoch)

            ss = time.time()
            if data is None:
                data = self.data_iter()

            if self.dataset == 'an4':
                inputs, labels_cpu, input_percentages, target_sizes = data
                input_sizes = input_percentages.mul_(int(inputs.size(3))).int()
            else:
                inputs, labels_cpu = data
            if self.is_cuda:
                if self.dnn == 'lstm':
                    inputs = Variable(inputs.transpose(0,
                                                       1).contiguous()).cuda()
                    labels = Variable(labels_cpu.transpose(
                        0, 1).contiguous()).cuda()
                else:
                    inputs, labels = inputs.cuda(
                        non_blocking=True), labels_cpu.cuda(non_blocking=True)
            else:
                labels = labels_cpu

            self.iotime += (time.time() - ss)

            if self.dnn == 'lstman4':
                out, output_sizes = self.net(inputs, input_sizes)
                out = out.transpose(0, 1)  # TxNxH
                loss = self.criterion(out, labels_cpu, output_sizes,
                                      target_sizes)
                loss = loss / inputs.size(0)  # average the loss by minibatch
                loss.backward()
            elif self.dnn == 'lstm':
                hidden = lstmpy.repackage_hidden(hidden)
                outputs, hidden = self.net(inputs, hidden)
                tt = torch.squeeze(
                    labels.view(-1, self.net.batch_size * self.net.num_steps))
                loss = self.criterion(outputs.view(-1, self.net.vocab_size),
                                      tt)
                loss.backward()
            else:
                # forward + backward + optimize
                outputs = self.net(inputs)
                loss = self.criterion(outputs, labels)
                loss.backward()
            loss_value = loss.item()
            # logger.info statistics
            self.loss += loss_value

            self.avg_loss_per_epoch += loss_value

            if self.dnn not in ['lstm', 'lstman4']:
                acc1, = self.cal_accuracy(outputs, labels, topk=(1, ))
                self.train_acc_top1.append(acc1)

            self.train_iter += 1
        self.num_of_updates_during_comm += 1
        self.loss /= num_of_iters
        self.timer += time.time() - s
        display = 100
        if self.train_iter % display == 0:
            logger.info(
                '[%3d][%5d/%5d][rank:%d] loss: %.3f, average forward and backward time: %f, iotime: %f '
                % (self.train_epoch, self.train_iter,
                   self.num_batches_per_epoch, self.rank, self.loss,
                   self.timer / display, self.iotime / display))
            mbytes = 1024. * 1024
            logger.info(
                'GPU memory usage memory_allocated: %d MBytes, max_memory_allocated: %d MBytes, memory_cached: %d MBytes, max_memory_cached: %d MBytes, CPU memory usage: %d MBytes',
                ct.memory_allocated() / mbytes,
                ct.max_memory_allocated() / mbytes,
                ct.memory_cached() / mbytes,
                ct.max_memory_cached() / mbytes,
                process.memory_info().rss / mbytes)
            self.timer = 0.0
            self.iotime = 0.0
            if self.is_cuda:
                torch.cuda.empty_cache()

        if self.dnn == 'lstm':
            return num_of_iters, hidden
        return num_of_iters
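The memory logging above references ct and process without defining them. One plausible setup, inferred from the calls used (an assumption, not shown in the original), is torch.cuda aliased as ct plus a psutil handle for the current process:

import os
import psutil
import torch.cuda as ct

process = psutil.Process(os.getpid())

# ct.memory_allocated()/ct.max_memory_allocated() report GPU tensor memory,
# ct.memory_cached()/ct.max_memory_cached() report the caching allocator's reserved
# memory (renamed memory_reserved()/max_memory_reserved() in newer PyTorch), and
# process.memory_info().rss is the host process resident set size in bytes.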