def get_coefs(datacfg, darknetcfg, learnetcfg, weightfile):
    """Collect per-class reweighting coefficients from the meta branch.

    Loads a Darknet/learnet model from `weightfile`, runs `meta_forward`
    over every meta example, and gathers the resulting dynamic weights.

    Args:
        datacfg: data-config file path; must define a 'meta' entry.
        darknetcfg, learnetcfg: parsed network configs for `Darknet`.
        weightfile: path to the trained weights.

    Returns:
        coef: list of 3 lists (one per meta-forward output head), each
        with `n_cls` sub-lists of numpy arrays — one array per meta
        example of that class.
    """
    options = read_data_cfg(datacfg)
    metadict = options['meta']

    m = Darknet(darknetcfg, learnetcfg)
    m.print_network()
    m.load_weights(weightfile)
    m.cuda()
    m.eval()

    kwargs = {'num_workers': 4, 'pin_memory': True}
    metaset = dataset.MetaDataset(metafiles=metadict, train=False, ensemble=True, with_ids=True)
    metaloader = torch.utils.data.DataLoader(metaset, batch_size=64, shuffle=False, **kwargs)
    n_cls = len(metaset.classes)

    # coef[i][c] accumulates the i-th head's weight vectors for class c.
    # (FIX: dropped an unused `cnt` per-class counter that was never read.)
    coef = [[[] for j in range(n_cls)] for i in range(3)]
    print('===> Generating dynamic weights...')
    kkk = 0
    for metax, mask, clsids in metaloader:
        print('===> {}/{}'.format(kkk, len(metaset) // 64))
        kkk += 1
        metax, mask = metax.cuda(), mask.cuda()
        # volatile=True: legacy (torch<0.4) inference mode, kept for
        # consistency with the rest of this file.
        metax, mask = Variable(metax, volatile=True), Variable(mask, volatile=True)
        dws = m.meta_forward(metax, mask)
        for ci, c in enumerate(clsids):
            for i in range(3):
                coef[i][c].append(dws[i][ci].data.squeeze().cpu().numpy())
    return coef
def valid(datacfg, darknetcfg, learnetcfg, weightfile, outfile, traindict, use_baserw=False):
    """Run detection on the validation set using ensembled dynamic weights.

    Builds per-class dynamic (reweighting) weights as a running mean over
    all meta examples, then runs `detect_forward` on each validation batch
    and writes per-class detection files (VOC-eval style) under
    `results/<backup>/ene[_]<ckpt>/`.

    Args:
        datacfg: data-config path; must define 'valid'.
        darknetcfg, learnetcfg: parsed network configs for `Darknet`.
        weightfile: trained weights path (also determines output prefix).
        outfile: filename prefix for the per-class result files.
        traindict: meta-file dict used to build the MetaDataset.
        use_baserw: if True, load reweighting vectors from
            'dynamic_weights.pkl' instead of computing them.
    """
    options = read_data_cfg(datacfg)
    valid_images = options['valid']
    metadict = traindict  # options['meta']
    # Output prefix is derived from the weightfile path:
    # results/<parent dir of weights>/ene[_]<checkpoint name>
    ckpt = weightfile.split('/')[-1].split('.')[0]
    backup = weightfile.split('/')[-2]
    ckpt_pre = '/ene_' if use_baserw else '/ene'
    prefix = 'results/' + backup.split('/')[-1] + ckpt_pre + ckpt
    print('saving to: ' + prefix)

    with open(valid_images) as fp:
        tmp_files = fp.readlines()
        valid_files = [item.rstrip() for item in tmp_files]

    m = Darknet(darknetcfg, learnetcfg)
    m.print_network()
    m.load_weights(weightfile)
    m.cuda()
    m.eval()

    valid_dataset = dataset.listDataset(valid_images, shape=(m.width, m.height),
                                        shuffle=False,
                                        transform=transforms.Compose([
                                            transforms.ToTensor(),
                                        ]))
    valid_batchsize = 2
    assert (valid_batchsize > 1)

    kwargs = {'num_workers': 4, 'pin_memory': True}
    valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=valid_batchsize,
                                               shuffle=False, **kwargs)

    if False:
        # Disabled alternative: single full-batch meta pass, then average
        # the dynamic weights per class via cumulative index ranges.
        metaset = dataset.MetaDataset(metafiles=metadict, train=False, ensemble=True)
        metaloader = torch.utils.data.DataLoader(metaset, batch_size=len(metaset),
                                                 shuffle=False, **kwargs)
        metaloader = iter(metaloader)
        n_cls = len(metaset.classes)
        print('===> Generating dynamic weights...')
        metax, mask = next(metaloader)  # FIX: was .next() (Python-2-only)
        metax, mask = metax.cuda(), mask.cuda()
        metax, mask = Variable(metax, volatile=True), Variable(mask, volatile=True)
        dynamic_weights = m.meta_forward(metax, mask)
        for i in range(len(dynamic_weights)):
            assert dynamic_weights[i].size(0) == sum(metaset.meta_cnts)
            inds = np.cumsum([0] + metaset.meta_cnts)
            new_weight = []
            for j in range(len(metaset.meta_cnts)):
                new_weight.append(torch.mean(dynamic_weights[i][inds[j]:inds[j + 1]], dim=0))
            dynamic_weights[i] = torch.stack(new_weight)
            print(dynamic_weights[i].shape)
    else:
        # Active path: stream meta examples in batches of 64 and keep a
        # per-class running mean of the first meta-forward head.
        metaset = dataset.MetaDataset(metafiles=metadict, train=False, ensemble=True, with_ids=True)
        metaloader = torch.utils.data.DataLoader(metaset, batch_size=64, shuffle=False, **kwargs)
        n_cls = len(metaset.classes)
        print(n_cls)
        enews = [0.0] * n_cls  # running mean per class
        cnt = [0.0] * n_cls    # samples seen per class
        print('===> Generating dynamic weights...')
        kkk = 0
        for metax, mask, clsids in metaloader:
            print('===> {}/{}'.format(kkk, len(metaset) // 64))
            kkk += 1
            metax, mask = metax.cuda(), mask.cuda()
            metax, mask = Variable(metax, volatile=True), Variable(mask, volatile=True)
            dws = m.meta_forward(metax, mask)
            dw = dws[0]
            for ci, c in enumerate(clsids):
                # Incremental mean: mean_{k+1} = mean_k * k/(k+1) + x/(k+1)
                enews[c] = enews[c] * cnt[c] / (cnt[c] + 1) + dw[ci] / (cnt[c] + 1)
                cnt[c] += 1
        dynamic_weights = [torch.stack(enews)]

    if use_baserw:
        import pickle
        # FIX: dropped a no-op .format(0) on a literal with no placeholders.
        f = 'dynamic_weights.pkl'
        print('===> Loading from {}...'.format(f))
        with open(f, 'rb') as f:
            rws = pickle.load(f)
            dynamic_weights = [Variable(torch.from_numpy(rw)).cuda() for rw in rws]

    if not os.path.exists(prefix):
        os.makedirs(prefix)

    # One output file per class: <prefix>/<outfile><class>.txt
    fps = [0] * n_cls
    for i, cls_name in enumerate(metaset.classes):
        buf = '%s/%s%s.txt' % (prefix, outfile, cls_name)
        fps[i] = open(buf, 'w')

    lineId = -1
    conf_thresh = 0.005
    nms_thresh = 0.45
    for batch_idx, (data, target) in enumerate(valid_loader):
        data = data.cuda()
        data = Variable(data, volatile=True)
        output = m.detect_forward(data, dynamic_weights)

        if isinstance(output, tuple):
            output = (output[0].data, output[1].data)
        else:
            output = output.data

        batch_boxes = get_region_boxes_v2(output, n_cls, conf_thresh, m.num_classes,
                                          m.anchors, m.num_anchors, 0, 1)

        if isinstance(output, tuple):
            bs = output[0].size(0)
        else:
            # Non-tuple output is stacked per class along the batch dim.
            assert output.size(0) % n_cls == 0
            bs = output.size(0) // n_cls

        for b in range(bs):
            lineId = lineId + 1
            imgpath = valid_dataset.lines[lineId].rstrip()
            print(imgpath)
            imgid = os.path.basename(imgpath).split('.')[0]
            width, height = get_image_size(imgpath)
            for i in range(n_cls):
                oi = b * n_cls + i
                boxes = batch_boxes[oi]
                boxes = nms(boxes, nms_thresh)
                for box in boxes:
                    # Convert (cx, cy, w, h) in [0,1] to absolute corners.
                    x1 = (box[0] - box[2] / 2.0) * width
                    y1 = (box[1] - box[3] / 2.0) * height
                    x2 = (box[0] + box[2] / 2.0) * width
                    y2 = (box[1] + box[3] / 2.0) * height
                    det_conf = box[4]
                    # FIX: // instead of / — plain division yields a float
                    # and range() raises TypeError on Python 3.
                    for j in range((len(box) - 5) // 2):
                        cls_conf = box[5 + 2 * j]
                        cls_id = box[6 + 2 * j]
                        prob = det_conf * cls_conf
                        fps[i].write('%s %f %f %f %f %f\n' % (imgid, prob, x1, y1, x2, y2))

    for i in range(n_cls):
        fps[i].close()
def valid(datacfg, darknetcfg, learnetcfg, weightfile, outfile):
    """Run detection on the validation set, pairing each image batch with
    a meta batch and calling the model's joint forward.

    Writes per-class detection files (VOC-eval style) under
    `results/<backup>/e<ckpt>/`.

    Args:
        datacfg: data-config path; must define 'valid' and 'meta'.
        darknetcfg, learnetcfg: parsed network configs for `Darknet`.
        weightfile: trained weights path (also determines output prefix).
        outfile: filename prefix for the per-class result files.
    """
    options = read_data_cfg(datacfg)
    valid_images = options['valid']
    metadict = options['meta']
    # Output prefix: results/<parent dir of weights>/e<checkpoint name>
    ckpt = weightfile.split('/')[-1].split('.')[0]
    backup = weightfile.split('/')[-2]
    prefix = 'results/' + backup.split('/')[-1] + '/e' + ckpt
    print('saving to: ' + prefix)

    with open(valid_images) as fp:
        tmp_files = fp.readlines()
        valid_files = [item.rstrip() for item in tmp_files]

    m = Darknet(darknetcfg, learnetcfg)
    m.print_network()
    m.load_weights(weightfile)
    m.cuda()
    m.eval()
    # Inference only — disable autograd globally for this process.
    torch.set_grad_enabled(False)

    valid_dataset = dataset.listDataset(valid_images, shape=(m.width, m.height),
                                        shuffle=False,
                                        transform=transforms.Compose([
                                            transforms.ToTensor(),
                                        ]))
    valid_batchsize = 2
    assert (valid_batchsize > 1)

    kwargs = {'num_workers': 4, 'pin_memory': True}
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=valid_batchsize, shuffle=False, **kwargs)

    metaset = dataset.MetaDataset(metafiles=metadict, train=False)
    metaloader = torch.utils.data.DataLoader(
        metaset, batch_size=metaset.batch_size, shuffle=False, **kwargs
    )
    metaloader = iter(metaloader)
    n_cls = len(metaset.classes)

    if not os.path.exists(prefix):
        os.makedirs(prefix)

    # One output file per class: <prefix>/<outfile><class>.txt
    fps = [0] * n_cls
    for i, cls_name in enumerate(metaset.classes):
        buf = '%s/%s%s.txt' % (prefix, outfile, cls_name)
        fps[i] = open(buf, 'w')

    lineId = -1
    conf_thresh = 0.005
    nms_thresh = 0.45
    for batch_idx, (data, target) in enumerate(valid_loader):
        # FIX: next(metaloader) instead of metaloader.next() — the .next()
        # method is Python-2-only and raises AttributeError on Python 3
        # (this function already uses torch.set_grad_enabled, i.e. a
        # torch>=0.4 / Python 3 era API).
        metax, mask = next(metaloader)
        data = data.cuda()
        mask = mask.cuda()
        metax = metax.cuda()
        data = Variable(data)
        mask = Variable(mask)
        metax = Variable(metax)
        output = m(data, metax, mask)
        if isinstance(output, tuple):
            output = (output[0].data, output[1].data)
        else:
            output = output.data
        batch_boxes = get_region_boxes_v2(output, n_cls, conf_thresh, m.num_classes,
                                          m.anchors, m.num_anchors, 0, 1)
        if isinstance(output, tuple):
            bs = output[0].size(0)
        else:
            # Non-tuple output is stacked per class along the batch dim.
            assert output.size(0) % n_cls == 0
            bs = output.size(0) // n_cls

        for b in range(bs):
            lineId = lineId + 1
            imgpath = valid_dataset.lines[lineId].rstrip()
            print(imgpath)
            imgid = os.path.basename(imgpath).split('.')[0]
            width, height = get_image_size(imgpath)
            for i in range(n_cls):
                oi = b * n_cls + i
                boxes = batch_boxes[oi]
                boxes = nms(boxes, nms_thresh)
                for box in boxes:
                    # Convert (cx, cy, w, h) in [0,1] to absolute corners.
                    x1 = (box[0] - box[2] / 2.0) * width
                    y1 = (box[1] - box[3] / 2.0) * height
                    x2 = (box[0] + box[2] / 2.0) * width
                    y2 = (box[1] + box[3] / 2.0) * height
                    det_conf = box[4]
                    # FIX: // instead of / — plain division yields a float
                    # and range() raises TypeError on Python 3.
                    for j in range((len(box) - 5) // 2):
                        cls_conf = box[5 + 2 * j]
                        cls_id = box[6 + 2 * j]
                        prob = det_conf * cls_conf
                        fps[i].write('%s %f %f %f %f %f\n' % (imgid, prob, x1, y1, x2, y2))

    for i in range(n_cls):
        fps[i].close()
def train(epoch):
    """Run one training epoch: pair each image batch with a meta batch,
    forward/backward through the joint model, and periodically save weights.

    Reads module-level globals: model, ngpus, trainlist, init_width,
    init_height, batch_size, num_workers, kwargs, metadict, optimizer,
    max_epochs, use_cuda, region_loss, cfg, backupdir.
    Mutates the global `processed_batches` and `region_loss.seen`.
    """
    global processed_batches
    t0 = time.time()
    # Under DataParallel the real module (with .seen / .save_weights)
    # lives behind .module.
    if ngpus > 1:
        cur_model = model.module
    else:
        cur_model = model
    train_loader = torch.utils.data.DataLoader(dataset.listDataset(
        trainlist,
        shape=(init_width, init_height),
        shuffle=False,
        transform=transforms.Compose([
            transforms.ToTensor(),
        ]),
        train=True,
        seen=cur_model.seen,
        batch_size=batch_size,
        num_workers=num_workers),
        batch_size=batch_size,
        shuffle=False,
        **kwargs)
    metaset = dataset.MetaDataset(metafiles=metadict, train=True)
    metaloader = torch.utils.data.DataLoader(metaset,
                                             batch_size=metaset.batch_size,
                                             shuffle=False,
                                             num_workers=num_workers,
                                             pin_memory=True)
    metaloader = iter(metaloader)
    lr = adjust_learning_rate(optimizer, processed_batches)
    logging('epoch %d/%d, processed %d samples, lr %f' %
            (epoch, max_epochs, epoch * len(train_loader.dataset), lr))
    model.train()
    t1 = time.time()
    avg_time = torch.zeros(9)  # per-stage timing accumulator (debug only)
    for batch_idx, (data, target) in enumerate(train_loader):
        # NOTE(review): .next() is the Python-2 iterator protocol; under
        # Python 3 this would need next(metaloader) — confirm interpreter.
        metax, mask = metaloader.next()
        t2 = time.time()
        adjust_learning_rate(optimizer, processed_batches)
        processed_batches = processed_batches + 1
        if use_cuda:
            data = data.cuda()
            metax = metax.cuda()
            mask = mask.cuda()
            #target= target.cuda()
        t3 = time.time()
        data, target = Variable(data), Variable(target)
        metax, mask = Variable(metax), Variable(mask)
        t4 = time.time()
        optimizer.zero_grad()
        t5 = time.time()
        output = model(data, metax, mask)
        t6 = time.time()
        region_loss.seen = region_loss.seen + data.data.size(0)
        loss = region_loss(output, target)
        t7 = time.time()
        loss.backward()
        t8 = time.time()
        optimizer.step()
        t9 = time.time()
        # Disabled per-stage profiling printout.
        if False and batch_idx > 1:
            avg_time[0] = avg_time[0] + (t2 - t1)
            avg_time[1] = avg_time[1] + (t3 - t2)
            avg_time[2] = avg_time[2] + (t4 - t3)
            avg_time[3] = avg_time[3] + (t5 - t4)
            avg_time[4] = avg_time[4] + (t6 - t5)
            avg_time[5] = avg_time[5] + (t7 - t6)
            avg_time[6] = avg_time[6] + (t8 - t7)
            avg_time[7] = avg_time[7] + (t9 - t8)
            avg_time[8] = avg_time[8] + (t9 - t1)
            print('-------------------------------')
            print('       load data : %f' % (avg_time[0] / (batch_idx)))
            print('     cpu to cuda : %f' % (avg_time[1] / (batch_idx)))
            print('cuda to variable : %f' % (avg_time[2] / (batch_idx)))
            print('       zero_grad : %f' % (avg_time[3] / (batch_idx)))
            print(' forward feature : %f' % (avg_time[4] / (batch_idx)))
            print('    forward loss : %f' % (avg_time[5] / (batch_idx)))
            print('        backward : %f' % (avg_time[6] / (batch_idx)))
            print('            step : %f' % (avg_time[7] / (batch_idx)))
            print('           total : %f' % (avg_time[8] / (batch_idx)))
            t1 = time.time()
    print('')
    t1 = time.time()
    logging('training with %f samples/s' % (len(train_loader.dataset) / (t1 - t0)))
    if (epoch + 1) % cfg.save_interval == 0:
        logging('save weights to %s/%06d.weights' % (backupdir, epoch + 1))
        cur_model.seen = (epoch + 1) * len(train_loader.dataset)
        cur_model.save_weights('%s/%06d.weights' % (backupdir, epoch + 1))
# --- Test/eval data pipeline setup (module-level script code) ---
print(num_workers)
# pin_memory only makes sense when transferring to GPU.
kwargs = {'num_workers': num_workers, 'pin_memory': True} if use_cuda else {}
test_loader = torch.utils.data.DataLoader(dataset.listDataset(
    testlist,
    shape=(init_width, init_height),
    shuffle=False,
    transform=transforms.Compose([
        transforms.ToTensor(),
    ]),
    train=False),
    batch_size=batch_size,
    shuffle=False,
    **kwargs)
test_metaset = dataset.MetaDataset(metafiles=metadict, train=True)
test_metaloader = torch.utils.data.DataLoader(
    test_metaset,
    batch_size=test_metaset.batch_size,
    shuffle=False,
    num_workers=num_workers // 2,
    pin_memory=True)

# Adjust learning rate
# Scaling factor chosen by the negative-sampling ratio; defaults to the
# number of meta classes when cfg.neg_ratio matches none of the cases.
factor = len(test_metaset.classes)
if cfg.neg_ratio == 'full':
    factor = 15.
elif cfg.neg_ratio == 1:
    factor = 3.0
elif cfg.neg_ratio == 0:
    factor = 1.5
def valid(datacfg, darknetcfg, learnetcfg, weightfile, outfile, use_baserw=False):
    """Compute per-class reweighting coefficients over all meta examples
    and dump them as text to './reweight_coef.data'.

    Note: the `outfile` argument is intentionally overridden below — the
    dump always goes to './reweight_coef.data' (preserved behavior).

    Args:
        datacfg: data-config path; must define 'valid' and 'meta'.
        darknetcfg, learnetcfg: parsed network configs for `Darknet`.
        weightfile: trained weights path.
        outfile: ignored (overridden); kept for signature compatibility.
        use_baserw: only affects the (unused) output-prefix string.
    """
    options = read_data_cfg(datacfg)
    valid_images = options['valid']
    metadict = options['meta']
    ckpt = weightfile.split('/')[-1].split('.')[0]
    backup = weightfile.split('/')[-2]
    ckpt_pre = '/ene_' if use_baserw else '/ene'
    prefix = 'results/' + backup.split('/')[-1] + ckpt_pre + ckpt
    print('saving to: ' + prefix)

    with open(valid_images) as fp:
        tmp_files = fp.readlines()
        valid_files = [item.rstrip() for item in tmp_files]

    m = Darknet(darknetcfg, learnetcfg)
    m.print_network()
    m.load_weights(weightfile)
    m.cuda()
    m.eval()

    kwargs = {'num_workers': 4, 'pin_memory': True}
    metaset = dataset.MetaDataset(metafiles=metadict, train=False, ensemble=True, with_ids=True)
    metaloader = torch.utils.data.DataLoader(
        metaset, batch_size=64, shuffle=False, **kwargs
    )
    n_cls = len(metaset.classes)

    # coef[i][c] collects the i-th head's dynamic weights for class c.
    # (FIX: dropped an unused `cnt` per-class counter that was never read.)
    coef = [[[] for j in range(n_cls)] for i in range(3)]
    print('===> Generating dynamic weights...')
    kkk = 0
    for metax, mask, clsids in metaloader:
        print('===> {}/{}'.format(kkk, len(metaset) // 64))
        kkk += 1
        metax, mask = metax.cuda(), mask.cuda()
        metax, mask = Variable(metax, volatile=True), Variable(mask, volatile=True)
        dws = m.meta_forward(metax, mask)
        for ci, c in enumerate(clsids):
            for i in range(3):
                coef[i][c].append(dws[i][ci])

    # Dump every coefficient vector as plain text, grouped by class, then
    # by head index.
    outfile = './reweight_coef.data'
    with open(outfile, 'w') as f:
        for c in range(n_cls):
            # FIX: the original indexed a bare `classes`, which is not
            # defined in this function; c ranges over metaset.classes.
            print('processing %s' % metaset.classes[c])
            f.write(metaset.classes[c] + '\n')
            for i in range(3):
                f.write('coef%d\n' % i)
                for dw in coef[i][c]:
                    for n in dw:
                        # n.data[0]: legacy (torch<0.4) scalar extraction.
                        f.write('%e ' % n.data[0])
                    f.write('\n')
def train(epoch):
    """One training epoch with gradient accumulation.

    The effective batch size is the global `batch_size`; the loader runs
    at `actual_bs` and the optimizer steps every
    `accumulate_step = batch_size // actual_bs` iterations.

    Reads module-level globals: model, ngpus, trainlist, init_width,
    init_height, actual_bs, batch_size, num_workers, kwargs, metadict,
    optimizer, max_epochs, use_cuda, region_loss, scheduler, cfg,
    backupdir. Mutates the global `processed_batches`.
    """
    global processed_batches
    ic("Training...")
    t0 = time.time()
    # Under DataParallel the real module lives behind .module.
    if ngpus > 1:
        cur_model = model.module
    else:
        cur_model = model
    train_loader = torch.utils.data.DataLoader(dataset.listDataset(
        trainlist,
        shape=(init_width, init_height),
        shuffle=False,
        transform=transforms.Compose([
            transforms.ToTensor(),
        ]),
        train=True,
        seen=cur_model.seen,
        batch_size=actual_bs,
        num_workers=num_workers),
        batch_size=actual_bs,
        shuffle=False,
        **kwargs)
    metaset = dataset.MetaDataset(metafiles=metadict, train=True)
    metaloader = torch.utils.data.DataLoader(metaset,
                                             batch_size=metaset.batch_size,
                                             shuffle=False,
                                             num_workers=num_workers,
                                             pin_memory=True)
    metaloader = iter(metaloader)
    lr = adjust_learning_rate(optimizer, processed_batches)
    logging('epoch %d/%d, processed %d samples, lr %.8f' %
            (epoch, max_epochs, epoch * len(train_loader.dataset), lr))
    logging('processed_batches %d' % (processed_batches))
    # AA I can't fit batch size 64 on my GPU, so I'm using gradient accumulation
    # to account for this
    accumulate_gradients = actual_bs != batch_size
    ic(accumulate_gradients)
    # Check that our effective bs is evenly divisible by our actual
    assert (batch_size % actual_bs == 0)
    accumulate_step = batch_size // actual_bs
    ic(accumulate_step)
    # B/c we're not validating, use loss as the LR scheduler trigger
    losses = []
    model.train()
    t1 = time.time()
    avg_time = torch.zeros(9)  # per-stage timing accumulator (debug only)
    optimizer.zero_grad()
    for batch_idx, (data, target) in enumerate(train_loader):
        #ic("iter")
        # NOTE(review): .next() is Python-2-only; next(metaloader) in Py3
        # — confirm interpreter (loss.item()/np.mean below suggest Py3).
        metax, mask = metaloader.next()
        t2 = time.time()
        adjust_learning_rate(optimizer, processed_batches)
        # processed_batches counts *effective* (accumulated) batches.
        if (batch_idx + 1) % accumulate_step == 0:
            processed_batches = processed_batches + 1
        if use_cuda:
            data = data.cuda()
            metax = metax.cuda()
            mask = mask.cuda()
            #target= target.cuda()
        t3 = time.time()
        data, target = Variable(data), Variable(target)
        metax, mask = Variable(metax), Variable(mask)
        t4 = time.time()
        t5 = time.time()
        output = model(data, metax, mask)
        t6 = time.time()
        region_loss.seen = region_loss.seen + data.data.size(0)
        loss = region_loss(output, target)
        t7 = time.time()
        # NOTE(review): loss is NOT divided by accumulate_step, so the
        # accumulated gradient is scaled up by that factor — confirm this
        # is intended (e.g. compensated by the LR schedule).
        loss.backward()
        losses.append(loss.item())
        t8 = time.time()
        # Step only once per accumulation window. NOTE(review): if the
        # number of batches is not divisible by accumulate_step, the tail
        # gradients are never stepped — confirm acceptable.
        if (batch_idx + 1) % accumulate_step == 0:
            #ic("step")
            optimizer.step()
            optimizer.zero_grad()
        t9 = time.time()
        # Disabled per-stage profiling printout.
        if False and batch_idx > 1:
            avg_time[0] = avg_time[0] + (t2 - t1)
            avg_time[1] = avg_time[1] + (t3 - t2)
            avg_time[2] = avg_time[2] + (t4 - t3)
            avg_time[3] = avg_time[3] + (t5 - t4)
            avg_time[4] = avg_time[4] + (t6 - t5)
            avg_time[5] = avg_time[5] + (t7 - t6)
            avg_time[6] = avg_time[6] + (t8 - t7)
            avg_time[7] = avg_time[7] + (t9 - t8)
            avg_time[8] = avg_time[8] + (t9 - t1)
            print('-------------------------------')
            print('       load data : %f' % (avg_time[0] / (batch_idx)))
            print('     cpu to cuda : %f' % (avg_time[1] / (batch_idx)))
            print('cuda to variable : %f' % (avg_time[2] / (batch_idx)))
            print('       zero_grad : %f' % (avg_time[3] / (batch_idx)))
            print(' forward feature : %f' % (avg_time[4] / (batch_idx)))
            print('    forward loss : %f' % (avg_time[5] / (batch_idx)))
            print('        backward : %f' % (avg_time[6] / (batch_idx)))
            print('            step : %f' % (avg_time[7] / (batch_idx)))
            print('           total : %f' % (avg_time[8] / (batch_idx)))
            t1 = time.time()
    print('')
    t1 = time.time()
    logging('training with %f samples/s' % (len(train_loader.dataset) / (t1 - t0)))
    # Mean epoch loss drives the LR scheduler (no validation pass).
    avg_loss = np.mean(losses)
    logging('Average epoch loss: %f' % avg_loss)
    scheduler.step(avg_loss)
    if (epoch + 1) % cfg.save_interval == 0:
        logging('save weights to %s/%06d.weights' % (backupdir, epoch + 1))
        cur_model.seen = (epoch + 1) * len(train_loader.dataset)
        cur_model.save_weights('%s/%06d.weights' % (backupdir, epoch + 1))
    if epoch == max_epochs - 1:
        print("Writing final model weights")
        cur_model.save_weights('%s/model_final.weights' % backupdir)
def train(epoch):
    """One training epoch with TensorBoard logging and mid-epoch checkpoints.

    Reads module-level globals: model, ngpus, trainlist, init_width,
    init_height, batch_size, num_workers, kwargs, metadict, optimizer,
    max_epochs, use_cuda, region_loss, writer, backupdir, save_interval.
    Mutates the global `processed_batches` and `region_loss.seen`.
    """
    global processed_batches
    t0 = time.time()
    # Under DataParallel the real module lives behind .module.
    if ngpus > 1:
        cur_model = model.module
    else:
        cur_model = model
    train_loader = torch.utils.data.DataLoader(
        dataset.listDataset(
            trainlist,
            shape=(init_width, init_height),
            shuffle=True,
            transform=transforms.Compose([
                # transforms.Resize([448, 448]),
                transforms.ToTensor(),
            ]),
            train=True,
            seen=cur_model.seen,
            batch_size=batch_size,
            num_workers=num_workers),
        batch_size=batch_size,
        shuffle=False,
        **kwargs)
    # print("block b nw is: ", batch_size, num_workers)
    metaset = dataset.MetaDataset(metafiles=metadict, train=True, num_workers=num_workers)
    metaloader = torch.utils.data.DataLoader(
        metaset,
        batch_size=metaset.batch_size,
        # batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=True  ####################
    )
    # print("meta b nw is: ", batch_size, num_workers)
    metaloader = iter(metaloader)
    lr = adjust_learning_rate(optimizer, processed_batches)
    logging('epoch %d/%d, processed %d samples, lr %f' %
            (epoch, max_epochs, epoch * len(train_loader.dataset), lr))
    model.train()
    # t1 = time.time()
    avg_time = torch.zeros(9)  # per-stage timing accumulator (debug only)
    _len = len(train_loader)
    for batch_idx, (data, target) in enumerate(tqdm(train_loader)):
        # t_data = time.time()
        # NOTE(review): .next() is Python-2-only; next(metaloader) in Py3.
        metax, mask = metaloader.next()
        # t2 = time.time()
        adjust_learning_rate(optimizer, processed_batches)
        processed_batches = processed_batches + 1
        if use_cuda:
            data = data.cuda()
            metax = metax.cuda()
            mask = mask.cuda()
            #target= target.cuda()
        # t3 = time.time()
        data, target = Variable(data), Variable(target)
        metax, mask = Variable(metax), Variable(mask)
        # t4 = time.time()
        optimizer.zero_grad()
        # t5 = time.time()
        # print("input data shape: ", [data.shape, metax.shape, mask.shape])
        output = model(data.float(), metax.float(), mask.float())  # torch.Size([1, 30, 13, 13])
        # t6 = time.time()
        region_loss.seen = region_loss.seen + data.data.size(0)
        # ("target shape :", target.shape)
        # This loss variant returns (total, per-part dict, printable
        # summary, global step) — see the writer calls below.
        loss_total, loss, printout, cur_step = region_loss(
            output, target.float(), use_cuda)
        # t7 = time.time()
        loss_total.backward()
        # t8 = time.time()
        optimizer.step()
        # t9 = time.time()
        # print(f"t_data:{t_data - t1}, t_meta:{t2 - t_data}")
        # Disabled per-stage profiling printout (references commented-out
        # t1..t9 timestamps; never executes).
        if False and batch_idx > 1:
            avg_time[0] = avg_time[0] + (t2 - t1)
            avg_time[1] = avg_time[1] + (t3 - t2)
            avg_time[2] = avg_time[2] + (t4 - t3)
            avg_time[3] = avg_time[3] + (t5 - t4)
            avg_time[4] = avg_time[4] + (t6 - t5)
            avg_time[5] = avg_time[5] + (t7 - t6)
            avg_time[6] = avg_time[6] + (t8 - t7)
            avg_time[7] = avg_time[7] + (t9 - t8)
            avg_time[8] = avg_time[8] + (t9 - t1)
            print('-------------------------------')
            print('       load data : %f' % (avg_time[0] / (batch_idx)))
            print('     cpu to cuda : %f' % (avg_time[1] / (batch_idx)))
            print('cuda to variable : %f' % (avg_time[2] / (batch_idx)))
            print('       zero_grad : %f' % (avg_time[3] / (batch_idx)))
            print(' forward feature : %f' % (avg_time[4] / (batch_idx)))
            print('    forward loss : %f' % (avg_time[5] / (batch_idx)))
            print('        backward : %f' % (avg_time[6] / (batch_idx)))
            print('            step : %f' % (avg_time[7] / (batch_idx)))
            print('           total : %f' % (avg_time[8] / (batch_idx)))
            # t1 = time.time()
        writer.add_scalar("scalar/trainLoss", loss_total.item(), cur_step)
        writer.add_scalars(
            "scalar/separatedLoss", {
                "loss_conf": loss["loss_conf"].item(),
                "loss_cls": loss["loss_cls"].item()
            }, cur_step)
        writer.add_scalar("scalar/trainLr", lr, cur_step)
        # Mid-epoch checkpoint every 301 batches.
        if batch_idx % 301 == 300:
            print(str(epoch + 1) + '->' + printout)
            logging('save weights to %s/%06d_%06d.weights' %
                    (backupdir, epoch + 1, batch_idx))
            cur_model.seen = (epoch + 1) * len(train_loader.dataset)
            cur_model.save_weights('%s/%06d_%06d.weights' %
                                   (backupdir, epoch + 1, batch_idx))
            print("save checkpoint finished!")
        # del loss_total, loss, cur_step
        # torch.cuda.empty_cache()
    # print('')
    t1 = time.time()
    logging('training with %f samples/s' % (len(train_loader.dataset) / (t1 - t0)))
    if (epoch + 1) % save_interval == 0:
        # torch.save({'epoch': epoch + 1, 'state_dict': model.state_dict(),
        #             'optimizer': optimizer.state_dict()},
        #            checkpoint_path + '/m-' + launchTimestamp + '-' + str(epoch+1) + 'epoch-' + str("%.4f" % loss_total.data) + '.pth.tar')
        logging('save weights to %s/%06d.weights' % (backupdir, epoch + 1))
        cur_model.seen = (epoch + 1) * len(train_loader.dataset)
        cur_model.save_weights('%s/%06d.weights' % (backupdir, epoch + 1))
        print("save checkpoint finished!")
def train(epoch):
    """One training epoch with a tqdm progress bar and eager tensor cleanup.

    Reads module-level globals: model, ngpus, trainlist, init_width,
    init_height, batch_size, num_workers, kwargs, metadict, optimizer,
    max_epochs, use_cuda, region_loss, cfg, backupdir, db.
    Mutates the global `processed_batches` and `region_loss.seen`.
    """
    global processed_batches
    t0 = time.time()
    # Under DataParallel the real module lives behind .module.
    if ngpus > 1:
        cur_model = model.module
    else:
        cur_model = model
    train_loader = torch.utils.data.DataLoader(
        dataset.listDataset(
            trainlist,
            shape=(init_width, init_height),
            shuffle=False,
            transform=transforms.Compose([
                transforms.ToTensor(),
                # lambda x: 2 * (x - 0.5)
            ]),
            train=True,
            seen=cur_model.seen,
            batch_size=batch_size,
            num_workers=num_workers),
        batch_size=batch_size,
        shuffle=False,
        **kwargs)
    metaset = dataset.MetaDataset(metafiles=metadict, train=True)
    metaloader = torch.utils.data.DataLoader(metaset,
                                             batch_size=metaset.batch_size,
                                             shuffle=False,
                                             num_workers=num_workers,
                                             pin_memory=True)
    metaloader = iter(metaloader)
    lr = adjust_learning_rate(optimizer, processed_batches)
    logging('epoch %d/%d, processed %d samples, lr %f' %
            (epoch, max_epochs, epoch * len(train_loader.dataset), lr))
    model.train()
    t1 = time.time()
    avg_time = torch.zeros(9)  # per-stage timing accumulator (debug only)
    device = torch.device('cuda')
    pbar = tqdm(dynamic_ncols=True, total=int(len(train_loader)))
    for batch_idx, (data, target) in enumerate(train_loader):
        # NOTE(review): .next() is Python-2-only; next(metaloader) in Py3
        # — yet loss.item()/torch.device above are torch>=0.4 APIs; confirm.
        metax, mask = metaloader.next()
        # NOTE(review): debug prints run every iteration — consider removing.
        db.printTensor(metax)
        db.printTensor(data)
        t2 = time.time()
        adjust_learning_rate(optimizer, processed_batches)
        processed_batches = processed_batches + 1
        data = data.to(device, non_blocking=True)
        metax = metax.to(device, non_blocking=True)
        mask = mask.to(device, non_blocking=True)
        t3 = time.time()
        t4 = time.time()
        optimizer.zero_grad()
        t5 = time.time()
        output = model(data, metax, mask)
        t6 = time.time()
        region_loss.seen = region_loss.seen + data.data.size(0)
        # Free inputs as soon as the loss no longer needs them (GPU memory).
        del data, metax, mask
        loss = region_loss(output, target)
        del output, target
        t7 = time.time()
        loss.backward()
        t8 = time.time()
        optimizer.step()
        t9 = time.time()
        status = '{} :: E: {} / {} :: iter: {} :: lr: {:.1e} :: L: {:.4f} '.format(
            'train', epoch, max_epochs, region_loss.seen, lr, loss.item())
        pbar.set_description(status, refresh=False)
        pbar.update(1)
        del loss
        # Disabled per-stage profiling printout.
        if False and batch_idx > 1:
            avg_time[0] = avg_time[0] + (t2 - t1)
            avg_time[1] = avg_time[1] + (t3 - t2)
            avg_time[2] = avg_time[2] + (t4 - t3)
            avg_time[3] = avg_time[3] + (t5 - t4)
            avg_time[4] = avg_time[4] + (t6 - t5)
            avg_time[5] = avg_time[5] + (t7 - t6)
            avg_time[6] = avg_time[6] + (t8 - t7)
            avg_time[7] = avg_time[7] + (t9 - t8)
            avg_time[8] = avg_time[8] + (t9 - t1)
            print('-------------------------------')
            print('       load data : %f' % (avg_time[0] / (batch_idx)))
            print('     cpu to cuda : %f' % (avg_time[1] / (batch_idx)))
            print('cuda to variable : %f' % (avg_time[2] / (batch_idx)))
            print('       zero_grad : %f' % (avg_time[3] / (batch_idx)))
            print(' forward feature : %f' % (avg_time[4] / (batch_idx)))
            print('    forward loss : %f' % (avg_time[5] / (batch_idx)))
            print('        backward : %f' % (avg_time[6] / (batch_idx)))
            print('            step : %f' % (avg_time[7] / (batch_idx)))
            print('           total : %f' % (avg_time[8] / (batch_idx)))
            t1 = time.time()
    pbar.close()
    print('')
    t1 = time.time()
    logging('training with %f samples/s' % (len(train_loader.dataset) / (t1 - t0)))
    if (epoch + 1) % cfg.save_interval == 0:
        logging('save weights to %s/%06d.weights' % (backupdir, epoch + 1))
        cur_model.seen = (epoch + 1) * len(train_loader.dataset)
        cur_model.save_weights('%s/%06d.weights' % (backupdir, epoch + 1))