def test_fetch(self):
    a = jt.array([1, 2, 3])
    a = a * 2
    v = []
    jt.fetch([a], lambda a: v.append(a))
    jt.sync_all(True)
    assert len(v) == 1 and (v[0] == [2, 4, 6]).all()
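
# A minimal standalone sketch (not part of the test above) of the jt.fetch
# pattern used throughout this file: jt.Var arguments arrive in the callback
# as numpy arrays, and the callback only fires once the graph is actually
# executed (e.g. via jt.sync_all).
import jittor as jt
import numpy as np

def fetch_demo():
    results = []
    x = jt.array([1.0, 2.0, 3.0]) * 10
    # the callback receives x as a np.ndarray after execution
    jt.fetch(x, lambda x: results.append(x))
    jt.sync_all(True)  # force execution so the fetch callback runs
    assert isinstance(results[0], np.ndarray)
    assert (results[0] == [10.0, 20.0, 30.0]).all()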
def test_fetch(self):
    a = jt.array([1, 2, 3])
    a = a * 2
    v = []
    jt.fetch(a, lambda a: v.append(a))
    jt.fetch(1, 2, 3, a,
             lambda x, y, z, a: self.assertTrue(
                 x == 1 and y == 2 and z == 3 and isinstance(a, np.ndarray)))
    jt.sync_all(True)
    assert len(v) == 1 and (v[0] == [2, 4, 6]).all()
def train(model, dataloader, optimizer, epoch, iteration):
    # switch to train mode
    model.train()
    averMeters.clear()
    end = time.time()
    for i, inputs in enumerate(dataloader):
        for k, v in inputs.items():
            print(type(v[0]))  # debug: inspect input batch types
        averMeters['data_time'].update(time.time() - end)
        iteration += 1
        lr = adjust_learning_rate(optimizer, iteration, BASE_LR=0.0002,
                                  WARM_UP_FACTOR=1.0 / 3, WARM_UP_ITERS=1000,
                                  STEPS=(0, 14150 * 15, 14150 * 20),
                                  GAMMA=0.1)

        # forward; the model returns the loss directly
        outputs = model(**inputs)
        loss = outputs
        jt.sync_all()

        # record the loss asynchronously once its value is ready
        jt.fetch(averMeters['loss'], loss, lambda a, l: a.update(l.data[0]))
        # averMeters['loss'].update(loss.data.item())

        # backward + parameter update
        optimizer.step(loss)

        # measure elapsed time
        averMeters['batch_time'].update(time.time() - end)
        end = time.time()

        if i % 10 == 0:
            logger.info('Epoch: [{0}][{1}/{2}]\t'
                        'Lr: [{3:.8f}]\t'
                        'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                        'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                        'loss {loss.val:.5f} ({loss.avg:.5f})\t'.format(
                            epoch, i, len(dataloader), lr,
                            batch_time=averMeters['batch_time'],
                            data_time=averMeters['data_time'],
                            loss=averMeters['loss']))
        if i % 10000 == 0:
            model.save(os.path.join(SNAPSHOTDIR, '%d_%d.pkl' % (epoch, i)))
            model.save(os.path.join(SNAPSHOTDIR, 'last.pkl'))
    return iteration
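
# train() above assumes an `averMeters` helper with .update/.val/.avg on each
# meter and .clear() on the collection; neither is defined in this file. The
# following is a hypothetical minimal sketch consistent with that usage.
from collections import defaultdict

class AverageMeter:
    """Tracks the latest value and the running average."""
    def __init__(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

class MeterDict(defaultdict):
    """Dict of AverageMeters; clear() resets all meters, as train() expects."""
    def __init__(self):
        super().__init__(AverageMeter)

averMeters = MeterDict()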
def test_densenet(self):
    self.setup_seed(1)
    loss_list = []
    acc_list = []
    mnist_net = MnistNet()
    global prev
    prev = time.time()
    SGD = nn.SGD(mnist_net.parameters(), self.learning_rate,
                 self.momentum, self.weight_decay)
    # SGD = jt.optim.Adam(mnist_net.parameters(), lr=0.0001)

    for batch_idx, (data, target) in enumerate(self.train_loader):
        output = mnist_net(data)
        loss = nn.cross_entropy_loss(output, target)
        SGD.step(loss)

        def callback(batch_idx, loss, output, target):
            # print train info
            global prev
            pred = np.argmax(output, axis=1)
            acc = np.mean(target == pred)
            loss_list.append(loss[0])
            acc_list.append(acc)
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tAcc: {:.6f} \tTime:{:.3f}'
                  .format(0, batch_idx, 600, 1. * batch_idx / 6.0,
                          loss[0], acc, time.time() - prev))
            # prev = time.time()

        jt.fetch(batch_idx, loss, output, target, callback)
        # Train Epoch: 0 [0/600 (0%)]  Loss: 2.402650  Acc: 0.060000
        # Train Epoch: 0 [1/600 (0%)]  Loss: 2.770145  Acc: 0.100000
        # Train Epoch: 0 [2/600 (0%)]  Loss: 3.528072  Acc: 0.100000
        # Train Epoch: 0 [3/600 (0%)]  Loss: 2.992042  Acc: 0.100000
        # Train Epoch: 0 [4/600 (1%)]  Loss: 4.672772  Acc: 0.060000
        # Train Epoch: 0 [5/600 (1%)]  Loss: 5.003410  Acc: 0.080000
        # Train Epoch: 0 [6/600 (1%)]  Loss: 5.417546  Acc: 0.100000
        # Train Epoch: 0 [7/600 (1%)]  Loss: 5.137665  Acc: 0.100000
        # Train Epoch: 0 [8/600 (1%)]  Loss: 5.241075  Acc: 0.070000
        # Train Epoch: 0 [9/600 (2%)]  Loss: 4.515363  Acc: 0.100000
        # Train Epoch: 0 [10/600 (2%)] Loss: 3.357187  Acc: 0.170000
        # Train Epoch: 0 [20/600 (3%)] Loss: 2.265879  Acc: 0.100000
        # Train Epoch: 0 [30/600 (5%)] Loss: 2.107000  Acc: 0.250000
        # Train Epoch: 0 [40/600 (7%)] Loss: 1.918214  Acc: 0.290000
        # Train Epoch: 0 [50/600 (8%)] Loss: 1.645694  Acc: 0.400000

    jt.sync_all(True)
    assert np.mean(loss_list[-50:]) < 0.3
    assert np.mean(acc_list[-50:]) > 0.9
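
# Note on the pattern above (an observation, not part of the original test):
# loop values are passed *through* jt.fetch rather than captured by the
# callback's closure. Fetch callbacks run later, at sync time, so a closure
# over batch_idx would see only its final value; binding it as a fetch
# argument freezes the value at call time. A minimal illustration:
def closure_pitfall_demo():
    fired = []
    xs = [jt.array([i]) for i in range(3)]
    for i, x in enumerate(xs):
        # `i` is bound now, as a fetch argument, not looked up at sync time
        jt.fetch(i, x, lambda i, x: fired.append((i, int(x[0]))))
    jt.sync_all(True)
    # sort in case callbacks do not fire strictly in issue order
    assert sorted(fired) == [(0, 0), (1, 1), (2, 2)]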
def test_vgg(self):
    self.setup_seed(1)
    loss_list = []
    acc_list = []
    mnist_net = MnistNet()
    SGD = nn.SGD(mnist_net.parameters(), self.learning_rate,
                 self.momentum, self.weight_decay)

    for batch_idx, (data, target) in enumerate(self.train_loader):
        output = mnist_net(data)
        loss = nn.cross_entropy_loss(output, target)

        # train step
        with jt.log_capture_scope(
                log_silent=1,
                log_v=1,
                log_vprefix="op.cc=100,exe=10",
        ) as logs:
            SGD.step(loss)

            # parameter order must match the jt.fetch call below, since
            # fetch passes its arguments to the callback positionally
            def callback(batch_idx, loss, output, target):
                # print train info
                pred = np.argmax(output, axis=1)
                acc = np.sum(target == pred) / self.batch_size
                loss_list.append(loss[0])
                acc_list.append(acc)
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tAcc: {:.6f}'
                      .format(0, batch_idx, 100, 1. * batch_idx, loss[0], acc))

            jt.fetch(batch_idx, loss, output, target, callback)

        log_conv = find_log_with_re(
            logs, "Jit op key (not )?found: ((mkl)|(cudnn))_conv.*")
        log_matmul = find_log_with_re(
            logs, "Jit op key (not )?found: ((mkl)|(cublas))_matmul.*")
        if batch_idx:
            assert len(log_conv) == 38 and len(log_matmul) == 12, (
                len(log_conv), len(log_matmul))

        mem_used = jt.flags.stat_allocator_total_alloc_byte \
            - jt.flags.stat_allocator_total_free_byte
        assert mem_used < 11e9, mem_used
        assert jt.core.number_of_lived_vars() < 3500
        if np.mean(loss_list[-50:]) < 0.2:
            break

    assert np.mean(loss_list[-50:]) < 0.2
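
# `find_log_with_re` is a Jittor test utility imported from the test helpers;
# its definition is not shown in this file. A hypothetical minimal version,
# assuming each record captured by jt.log_capture_scope is a dict with a
# "msg" field, might look like:
import re

def find_log_with_re(logs, pattern):
    # return every captured log record whose message matches the regex
    p = re.compile(pattern)
    return [log for log in logs if p.search(log["msg"])]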
def test_resnet(self):
    self.setup_seed(1)
    loss_list = []
    acc_list = []
    mnist_net = MnistNet()
    global prev
    prev = time.time()
    SGD = nn.SGD(mnist_net.parameters(), self.learning_rate,
                 self.momentum, self.weight_decay)
    iters = 10

    for batch_idx, (data, target) in enumerate(self.train_loader):
        if batch_idx > iters:
            break
        jt.display_memory_info()
        output = mnist_net(data)
        loss = nn.cross_entropy_loss(output, target)
        SGD.step(loss)

        def callback(batch_idx, loss, output, target):
            global prev
            pred = np.argmax(output, axis=1)
            acc = np.mean(target == pred)
            loss_list.append(loss[0])
            acc_list.append(acc)
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tAcc: {:.6f} \tTime:{:.3f}'
                  .format(0, batch_idx, iters, 1. * batch_idx / 6.0,
                          loss[0], acc, time.time() - prev))

        jt.fetch(batch_idx, loss, output, target, callback)

    jt.sync_all(True)
    jt.display_max_memory_info()

    _, out = jt.get_max_memory_treemap()
    out_ = out.split('\n')
    assert out_[0] == 'root()'
    assert out_[3].endswith('(_run_module_as_main)')
    assert out_[7].endswith('(_run_code)')

    _, out = jt.get_max_memory_treemap(build_by=1)
    out_ = out.split('\n')
    assert out_[0] == 'root()'
    assert out_[4].endswith('(_run_module_as_main)')
    assert out_[8].endswith('(_run_code)')
def prep_benchmark(dets_out, h, w):
    with timer.env('Postprocess'):
        t = postprocess(dets_out, w, h, crop_masks=args.crop,
                        score_threshold=args.score_threshold)

    result = {}
    with timer.env('Copy'):
        classes, scores, boxes, masks = [x[:args.top_k] for x in t]
        # fetch asynchronously instead of eager .numpy() copies
        if isinstance(scores, list):
            box_scores = scores[0]
            mask_scores = scores[1]
            jt.fetch(box_scores,
                     lambda box_scores: result.update({'box_scores': box_scores}))
            jt.fetch(mask_scores,
                     lambda mask_scores: result.update({'mask_scores': mask_scores}))
        else:
            jt.fetch(scores, lambda scores: result.update({'scores': scores}))
        jt.fetch(classes, lambda classes: result.update({'classes': classes}))
        jt.fetch(boxes, lambda boxes: result.update({'boxes': boxes}))
        jt.fetch(masks, lambda masks: result.update({'masks': masks}))

    with timer.env('Sync'):
        # Just in case
        jt.sync_all()
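
# An alternative single-fetch sketch (an illustration, not the benchmark's
# actual code): since jt.fetch accepts multiple vars, all four outputs can be
# bundled into one call so the dict is filled by a single callback instead of
# four separate ones.
def fetch_all_demo(classes, scores, boxes, masks):
    result = {}
    jt.fetch(classes, scores, boxes, masks,
             lambda c, s, b, m: result.update(
                 {'classes': c, 'scores': s, 'boxes': b, 'masks': m}))
    jt.sync_all()
    return result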
def test_resnet(self):
    self.setup_seed(1)
    loss_list = []
    acc_list = []
    mnist_net = MnistNet()
    global prev
    prev = time.time()
    SGD = nn.SGD(mnist_net.parameters(), self.learning_rate,
                 self.momentum, self.weight_decay)
    self.train_loader.endless = True

    for data, target in self.train_loader:
        batch_id = self.train_loader.batch_id
        epoch_id = self.train_loader.epoch_id

        # train step
        # with jt.log_capture_scope(
        #     log_silent=1,
        #     log_v=1, log_vprefix="op.cc=100,exe=10",
        # ) as logs:
        output = mnist_net(data)
        loss = nn.cross_entropy_loss(output, target)
        SGD.step(loss)

        def callback(epoch_id, batch_id, loss, output, target):
            # print train info
            global prev
            pred = np.argmax(output, axis=1)
            acc = np.mean(target == pred)
            loss_list.append(loss[0])
            acc_list.append(acc)
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tAcc: {:.6f} \tTime:{:.3f}'
                  .format(epoch_id, batch_id, 600, 1. * batch_id / 6.0,
                          loss[0], acc, time.time() - prev))
            # prev = time.time()

        jt.fetch(epoch_id, batch_id, loss, output, target, callback)

        # log_conv = find_log_with_re(logs,
        #     "Jit op key (not )?found: ((mkl)|(cudnn))_conv.*")
        # log_matmul = find_log_with_re(logs,
        #     "Jit op key (not )?found: ((mkl)|(cublas))_matmul.*")
        # if batch_id > 2:
        #     assert len(log_conv)==59 and len(log_matmul)==6, (len(log_conv), len(log_matmul))

        mem_used = jt.flags.stat_allocator_total_alloc_byte \
            - jt.flags.stat_allocator_total_free_byte
        # assert mem_used < 4e9, mem_used
        # TODO: why bigger?
        assert mem_used < 5.6e9, mem_used

        # example log:
        # Train Epoch: 0 [0/100 (0%)]   Loss: 2.352903   Acc: 0.110000
        # Train Epoch: 0 [1/100 (1%)]   Loss: 2.840830   Acc: 0.080000
        # Train Epoch: 0 [2/100 (2%)]   Loss: 3.473594   Acc: 0.100000
        # Train Epoch: 0 [3/100 (3%)]   Loss: 3.131615   Acc: 0.200000
        # Train Epoch: 0 [4/100 (4%)]   Loss: 2.524094   Acc: 0.230000
        # Train Epoch: 0 [5/100 (5%)]   Loss: 7.780025   Acc: 0.080000
        # Train Epoch: 0 [6/100 (6%)]   Loss: 3.890721   Acc: 0.160000
        # Train Epoch: 0 [7/100 (7%)]   Loss: 6.370137   Acc: 0.140000
        # Train Epoch: 0 [8/100 (8%)]   Loss: 11.390827  Acc: 0.150000
        # Train Epoch: 0 [9/100 (9%)]   Loss: 21.598564  Acc: 0.080000
        # Train Epoch: 0 [10/100 (10%)] Loss: 23.369165  Acc: 0.130000
        # Train Epoch: 0 [20/100 (20%)] Loss: 4.804510   Acc: 0.100000
        # Train Epoch: 0 [30/100 (30%)] Loss: 3.393924   Acc: 0.110000
        # Train Epoch: 0 [40/100 (40%)] Loss: 2.286762   Acc: 0.130000
        # Train Epoch: 0 [50/100 (50%)] Loss: 2.055014   Acc: 0.290000

        if jt.in_mpi:
            assert jt.core.number_of_lived_vars() < 8100, \
                jt.core.number_of_lived_vars()
        else:
            assert jt.core.number_of_lived_vars() < 7000, \
                jt.core.number_of_lived_vars()
        if self.train_loader.epoch_id >= 2:
            break

    jt.sync_all(True)
    assert np.mean(loss_list[-50:]) < 0.5
    assert np.mean(acc_list[-50:]) > 0.8
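
# A small helper (not in the original tests) wrapping the allocator
# accounting used above: live bytes = total allocated - total freed.
def live_allocator_bytes():
    return (jt.flags.stat_allocator_total_alloc_byte
            - jt.flags.stat_allocator_total_free_byte)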