def test(config): """Test point cloud data loader. """ from torch.utils.data import DataLoader from lib.utils import Timer timer = Timer() DatasetClass = StanfordVoxelization2cmDataset transformations = [ t.RandomHorizontalFlip(DatasetClass.ROTATION_AXIS, DatasetClass.IS_TEMPORAL), t.ChromaticAutoContrast(), t.ChromaticTranslation(config.data_aug_color_trans_ratio), t.ChromaticJitter(config.data_aug_color_jitter_std), t.HueSaturationTranslation(config.data_aug_hue_max, config.data_aug_saturation_max), ] dataset = DatasetClass(config, input_transform=t.Compose(transformations), augment_data=True, cache=True, elastic_distortion=True) data_loader = DataLoader( dataset=dataset, collate_fn=t.cfl_collate_fn_factory(limit_numpoints=False), batch_size=4, shuffle=True) # Start from index 1 iter = data_loader.__iter__() for i in range(100): timer.tic() data = iter.next() print(timer.toc())
def train(lr=0.001, progress_bar=False, fig_dir='./figs', prefix='NET'):
  loss_all, DCS, P, A, R, loss_epoch = trainer.load_checkpoint(prefix=prefix)
  timer = Timer(args.checkpoint_timer)
  for e in trainer.get_range(EPOCHS):
    if not DEEPVESSEL:
      model.train()
    print('len(loss_all):', len(loss_all), 'type(loss_all):', type(loss_all))
    loss = trainer.train_epoch(lr=lr, progress_bar=progress_bar)
    mean_loss = np.array(loss).mean()
    loss_epoch.append(mean_loss)
    print('EPOCH', e, 'loss epoch', mean_loss)
    print('len(loss):', len(loss), 'type(loss):', type(loss))
    loss_all = loss_all + loss
    if DEEPVESSEL:
      print('Evaluation Epoch {}/{}...'.format(e, EPOCHS))
      DCS.append(evaluator.DCM(model, progress_bar=progress_bar))
      a, p, r = evaluator.bin_scores(model, progress_bar=progress_bar)
      P.append(p)
      A.append(a)
      R.append(r)
      print('DCS score:', DCS[-1], 'accuracy', a, 'precision', p, 'recall', r)
    else:
      with torch.no_grad():
        print('Evaluation Epoch {}/{}...'.format(e, EPOCHS))
        model.eval()
        DCS.append(evaluator.DCM(model, progress_bar=progress_bar))
        a, p, r = evaluator.bin_scores(model, progress_bar=progress_bar)
        P.append(p)
        A.append(a)
        R.append(r)
        print('DCS score:', DCS[-1], 'accuracy', a, 'precision', p, 'recall', r)
    if timer.is_time():
      # Periodic checkpoint while training is still in progress.
      measurements = np.array([DCS, P, A, R, loss_epoch])
      trainer.save_model(MODEL_PATH)
      trainer.save_checkpoint(np.array(loss_all), measurements, prefix, lr,
                              args.dataset, e, EPOCHS, fig_dir, args.upload)
  # Final save once all epochs are done.
  loss_all = np.array(loss_all)
  measurements = np.array([DCS, P, A, R, loss_epoch])
  trainer.save_model(MODEL_PATH)
  trainer.save_checkpoint(loss_all, measurements, prefix, lr, args.dataset,
                          EPOCHS, EPOCHS, fig_dir, args.upload)
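Unlike the tic/toc timers elsewhere in this section, `Timer(args.checkpoint_timer)` above is interval-based: `is_time()` fires once the configured period has elapsed and then restarts the countdown. A possible sketch of that behaviour, assuming the constructor argument is an interval in seconds and with an illustrative class name:

import time


class CheckpointTimer(object):
  """Sketch of an interval timer exposing is_time(); the real class may differ."""

  def __init__(self, interval_seconds):
    self.interval = interval_seconds
    self.last = time.time()

  def is_time(self):
    # True at most once per interval; resets the countdown when it fires.
    now = time.time()
    if now - self.last >= self.interval:
      self.last = now
      return True
    return False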
def test(config, intensity=False):
  """Test point cloud data loader."""
  from torch.utils.data import DataLoader
  from lib.utils import Timer
  import open3d as o3d

  def make_pcd(coords, feats):
    pcd = o3d.geometry.PointCloud()
    pcd.points = o3d.utility.Vector3dVector(coords[:, :3].float().numpy())
    pcd.colors = o3d.utility.Vector3dVector(feats[:, :3].numpy() / 255)
    if intensity:
      pcd.intensities = o3d.utility.Vector3dVector(feats[:, 3:].numpy())
    return pcd

  timer = Timer()
  DatasetClass = FacilityArea5Dataset
  transformations = [
      t.RandomHorizontalFlip(DatasetClass.ROTATION_AXIS, DatasetClass.IS_TEMPORAL),
      t.ChromaticAutoContrast(),
      t.ChromaticTranslation(config.data_aug_color_trans_ratio),
      t.ChromaticJitter(config.data_aug_color_jitter_std),
  ]

  dataset = DatasetClass(
      config,
      prevoxel_transform=t.ElasticDistortion(DatasetClass.ELASTIC_DISTORT_PARAMS),
      input_transform=t.Compose(transformations),
      augment_data=True,
      cache=True,
      elastic_distortion=True)

  data_loader = DataLoader(
      dataset=dataset,
      collate_fn=t.cfl_collate_fn_factory(limit_numpoints=False),
      batch_size=1,
      shuffle=True)

  # Visualize and time 100 batches.
  data_iter = iter(data_loader)
  for i in range(100):
    timer.tic()
    coords, feats, labels = next(data_iter)
    pcd = make_pcd(coords, feats)
    o3d.visualization.draw_geometries([pcd])
    print(timer.toc())
def test(self):
  from torch.utils.data import DataLoader
  from lib.utils import Timer
  from config import get_config
  config = get_config()

  dataset = SynthiaVoxelizationDataset(config)
  timer = Timer()

  data_loader = DataLoader(
      dataset=dataset,
      collate_fn=cfl_collate_fn_factory(limit_numpoints=False),
      num_workers=0,
      batch_size=4,
      shuffle=True)

  # Start from index 1
  # for i, batch in enumerate(data_loader, 1):
  data_iter = iter(data_loader)
  for i in range(100):
    timer.tic()
    batch = next(data_iter)
    print(batch, timer.toc())
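The loader tests above all pass `cfl_collate_fn_factory(limit_numpoints=False)` to the DataLoader. The idea is to batch variable-length point clouds by concatenating coordinates, features and labels and tagging each coordinate row with its batch index. The following is a simplified sketch of that pattern, not the project's exact implementation: `limit_numpoints` is accepted but ignored, and whether the batch index goes in the first or the last coordinate column varies between the training loops later in this section (this sketch puts it first).

import torch


def cfl_collate_fn_factory(limit_numpoints=False):
  """Sketch of a coords/feats/labels collate function for sparse point-cloud batches."""

  def collate_fn(list_data):
    coords_list, feats_list, labels_list = list(zip(*list_data))
    coords_batch, feats_batch, labels_batch = [], [], []
    for batch_id, (coords, feats, labels) in enumerate(
        zip(coords_list, feats_list, labels_list)):
      num_points = coords.shape[0]
      # Prepend the batch index so the sparse-tensor library can tell samples apart.
      batch_col = torch.full((num_points, 1), batch_id, dtype=torch.int32)
      coords_batch.append(torch.cat((batch_col, torch.as_tensor(coords).int()), dim=1))
      feats_batch.append(torch.as_tensor(feats).float())
      labels_batch.append(torch.as_tensor(labels).long())
    return (torch.cat(coords_batch, dim=0),
            torch.cat(feats_batch, dim=0),
            torch.cat(labels_batch, dim=0))

  return collate_fn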
class drQt(drQtUI.Ui_MainWindow,QtGui.QMainWindow): node_properties=["Id", "Hostname", "Arch", "OS", "Nbits", "Procspeed", "CPUs", "Cores each CPU", "Memory", "Load average", "Pools"] job_properties=["Id", "Name", "Owner", "Status", "Total tasks", "Tasks left", "Done", "Pool"] def __init__(self,parent=None): super(drQt,self).__init__(parent=parent) try: lib.utils.get_all_jobs() except: raise "NO MASTER FOUND" self.setupUi(self) self._timer_ = Timer(parent=self) self.timer_interrupt = 0 self.jobs_tab_list = [] self.nodes_tab_list = [] self._selected_job_row = None self.setup_main() self.PB_refresh.clicked.connect(self.refresh) self.CB_auto_refresh.stateChanged.connect(self.set_autorefresh) self.connect(self._timer_,QtCore.SIGNAL("time_elapsed"),self.refresh) self.SB_refresh_time.setMinimum(1) self.SB_refresh_time.setValue(2) self.setWindowFlags(QtCore.Qt.Window | QtCore.Qt.WindowMinimizeButtonHint | QtCore.Qt.WindowCloseButtonHint | QtCore.Qt.WindowMaximizeButtonHint) self.LB_header.setPixmap(QtGui.QPixmap(os.path.join(icons_path, "drQHeader.png"))) self.connect(self.TW_job,QtCore.SIGNAL("cellClicked(int,int)"), self._store_selected_job) self.connect(self.TW_job,QtCore.SIGNAL("customContextMenuRequested(QPoint)"), self._create_context) def _raise_new_job(self): log.debug("start new job") newjobD = SendJob(self) newjobD.show() def _raise_about(self): aboutD= AboutDialog(self) aboutD.show() def _store_selected_job(self, row,column): self._selected_job_row = row def setup_menu_bar(self): menu_bar = self.menuBar() job_bar = menu_bar.addMenu("&Job") new_job = QtGui.QAction("&New Job",self) self.connect(new_job, QtCore.SIGNAL('triggered()'), self._raise_new_job) self.connect(self.NewJobButton, QtCore.SIGNAL('clicked()'), self._raise_new_job) job_bar.addAction(new_job) help_bar = menu_bar.addMenu("&Help") About = QtGui.QAction("&About",self) self.connect(About, QtCore.SIGNAL('triggered()'), self._raise_about) help_bar.addAction(About) def setup_main(self): self.setWindowTitle("DrQueue Manager") self.resize(1000,600) self.setup_menu_bar() self.set_main_icons() self.setup_jobs() self.init_jobs_tabs() self.setup_slaves() self.init_slaves_tabs() def setup_slaves(self): self.TW_node.clear() self.TW_node.setColumnCount(len(self.node_properties)) self.TW_node.setHorizontalHeaderLabels(self.node_properties) self.TW_node.verticalHeader().hide() self.TW_node.setAlternatingRowColors(True) self.TW_node.setSelectionBehavior(QtGui.QTableView.SelectRows) self.TW_node.setSelectionMode(QtGui.QTableView.SingleSelection) def setup_jobs(self): self.TW_job.clear() self.TW_job.setColumnCount(len(self.job_properties)) self.TW_job.setHorizontalHeaderLabels(self.job_properties) self.TW_job.verticalHeader().hide() self.TW_job.setAlternatingRowColors(True) self.TW_job.setSelectionBehavior(QtGui.QTableView.SelectRows) self.TW_job.setSelectionMode(QtGui.QTableView.SingleSelection) def refresh(self): self.setCursor(QtCore.Qt.WaitCursor); self.init_jobs_tabs() self.init_slaves_tabs() self.TW_job.repaint() self.TW_node.repaint() if self._selected_job_row != None: log.debug("restore row selection %s" % self._selected_job_row) self.TW_job.setCurrentCell(self._selected_job_row,0) self.setCursor(QtCore.Qt.ArrowCursor); def set_autorefresh(self,status): if status: log.debug("autorefresh:ON") refresh_time = self.SB_refresh_time.value() self._timer_.set_run_time(refresh_time) self._timer_.start() else: log.debug("autorefresh:OFF") self._timer_.terminate() def init_jobs_tabs(self): self.jobs_tab_list=[] log.debug("building job tabs...") 
self.TW_job.clearContents() jobs = lib.utils.get_all_jobs() num_jobs = len(jobs) log.debug("num jobs %s" % num_jobs) self.TW_job.setRowCount(num_jobs) for i in range(num_jobs): job_tab = JobTab(jobs[i],parent = self.TW_job) job_tab.add_to_table(self.TW_job, i) self.connect(job_tab, QtCore.SIGNAL('update'), self.refresh) self.jobs_tab_list.append(job_tab) def init_slaves_tabs(self): self.nodes_tab_list = [] log.debug("building nodes tabs...") nodes = lib.utils.get_all_slaves() num_nodes = len(nodes) self.TW_node.setRowCount(num_nodes) for i in range(num_nodes): log.debug("create slave Node Tab : %s" % type(nodes[i])) node_tab = SlaveNodeTab(nodes[i],parent = self.TW_node) node_tab.add_to_table(self.TW_node, i) self.connect(node_tab, QtCore.SIGNAL('update'), self.refresh) self.nodes_tab_list.append(node_tab) def set_main_icons(self): self.setWindowIcon(QtGui.QIcon(os.path.join(icons_path, "main.svg"))) self.TW_main.setTabIcon(0,QtGui.QIcon(os.path.join(icons_path, "job.svg"))) self.TW_main.setTabIcon(1,QtGui.QIcon(os.path.join(icons_path, "network-transmit-receive.svg"))) def _create_context(self, QPoint): """ create the context menu """ newAct = QtGui.QAction("&New Job",self) newAct.setToolTip("createa new job") self.connect(newAct, QtCore.SIGNAL('triggered()'), self._new_job_show) menu = QtGui.QMenu("Menu", self) menu.addAction(newAct) menu.exec_(QtGui.QCursor.pos())
def train(self, train_loader, val_loader=None): ''' Given data queues, train the network ''' # Parameter directory save_dir = os.path.join(cfg.DIR.OUT_PATH) if not os.path.exists(save_dir): os.makedirs(save_dir) # Timer for the training op and parallel data loading op. train_timer = Timer() data_timer = Timer() training_losses = [] # Setup learning rates lr_steps = [int(k) for k in cfg.TRAIN.LEARNING_RATES.keys()] #Setup the lr_scheduler self.lr_scheduler = lr_scheduler.MultiStepLR(self.optimizer, lr_steps, gamma=0.1) start_iter = 0 # Resume training if cfg.TRAIN.RESUME_TRAIN: self.load(cfg.CONST.WEIGHTS) start_iter = cfg.TRAIN.INITIAL_ITERATION # Main training loop train_loader_iter = iter(train_loader) for train_ind in range(start_iter, cfg.TRAIN.NUM_ITERATION + 1): self.lr_scheduler.step() data_timer.tic() try: batch_img, batch_voxel = train_loader_iter.next() except StopIteration: train_loader_iter = iter(train_loader) batch_img, batch_voxel = train_loader_iter.next() data_timer.toc() if self.net.is_x_tensor4: batch_img = batch_img[0] # Apply one gradient step train_timer.tic() loss = self.train_loss(batch_img, batch_voxel) train_timer.toc() training_losses.append(loss.item()) # Decrease learning rate at certain points if train_ind in lr_steps: #for pytorch optimizer, learning rate can only be set when the optimizer is created #or using torch.optim.lr_scheduler print('Learing rate decreased to %f: ' % cfg.TRAIN.LEARNING_RATES[str(train_ind)]) # Debugging modules # # Print status, run validation, check divergence, and save model. if train_ind % cfg.TRAIN.PRINT_FREQ == 0: # Print the current loss print('%s Iter: %d Loss: %f' % (datetime.now(), train_ind, loss)) if train_ind % cfg.TRAIN.VALIDATION_FREQ == 0 and val_loader is not None: # Print test loss and params to check convergence every N iterations val_losses = 0 val_num_iter = min(cfg.TRAIN.NUM_VALIDATION_ITERATIONS, len(val_loader)) val_loader_iter = iter(val_loader) for i in range(val_num_iter): batch_img, batch_voxel = val_loader_iter.next() val_loss = self.train_loss(batch_img, batch_voxel) val_losses += val_loss var_losses_mean = val_losses / val_num_iter print('%s Test loss: %f' % (datetime.now(), var_losses_mean)) if train_ind % cfg.TRAIN.NAN_CHECK_FREQ == 0: # Check that the network parameters are all valid nan_or_max_param = max_or_nan(self.net.parameters()) if has_nan(nan_or_max_param): print('NAN detected') break if (train_ind % cfg.TRAIN.SAVE_FREQ == 0 and not train_ind == 0) or \ (train_ind == cfg.TRAIN.NUM_ITERATION): # Save the checkpoint every a few iterations or at the end. self.save(training_losses, save_dir, train_ind) #loss is a Variable containing torch.FloatTensor of size 1 if loss.item() > cfg.TRAIN.LOSS_LIMIT: print("Cost exceeds the threshold. Stop training") break
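The divergence check in the trainer above calls `max_or_nan` and `has_nan`, which are not shown in this section. A hedged sketch of what they might look like for the PyTorch variant (per-parameter maximum magnitudes, with NaN wherever a parameter contains NaN); the exact return format is an assumption:

import torch


def max_or_nan(params):
  """Sketch: per-parameter max magnitude, NaN where a parameter contains NaN."""
  stats = []
  for param in params:
    data = param.detach()
    if torch.isnan(data).any():
      stats.append(float('nan'))
    else:
      stats.append(data.abs().max().item())
  return torch.tensor(stats)


def has_nan(values):
  """Sketch: True if any collected statistic is NaN."""
  return bool(torch.isnan(values).any())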
def train(model, data_loader, val_data_loader, config, transform_data_fn=None): device = config.device_id distributed = get_world_size() > 1 # Set up the train flag for batch normalization model.train() # Configuration writer = SummaryWriter(log_dir=config.log_dir) data_timer, iter_timer = Timer(), Timer() fw_timer, bw_timer, ddp_timer = Timer(), Timer(), Timer() data_time_avg, iter_time_avg = AverageMeter(), AverageMeter() fw_time_avg, bw_time_avg, ddp_time_avg = AverageMeter(), AverageMeter( ), AverageMeter() losses, scores = AverageMeter(), AverageMeter() optimizer = initialize_optimizer(model.parameters(), config) scheduler = initialize_scheduler(optimizer, config) criterion = nn.CrossEntropyLoss(ignore_index=config.ignore_label) writer = SummaryWriter(log_dir=config.log_dir) # Train the network logging.info('===> Start training on {} GPUs, batch-size={}'.format( get_world_size(), config.batch_size * get_world_size())) best_val_miou, best_val_iter, curr_iter, epoch, is_training = 0, 0, 1, 1, True if config.resume: checkpoint_fn = config.resume + '/weights.pth' if osp.isfile(checkpoint_fn): logging.info("=> loading checkpoint '{}'".format(checkpoint_fn)) state = torch.load( checkpoint_fn, map_location=lambda s, l: default_restore_location(s, 'cpu')) curr_iter = state['iteration'] + 1 epoch = state['epoch'] load_state(model, state['state_dict']) if config.resume_optimizer: scheduler = initialize_scheduler(optimizer, config, last_step=curr_iter) optimizer.load_state_dict(state['optimizer']) if 'best_val' in state: best_val_miou = state['best_val'] best_val_iter = state['best_val_iter'] logging.info("=> loaded checkpoint '{}' (epoch {})".format( checkpoint_fn, state['epoch'])) else: raise ValueError( "=> no checkpoint found at '{}'".format(checkpoint_fn)) data_iter = data_loader.__iter__() # (distributed) infinite sampler while is_training: for iteration in range(len(data_loader) // config.iter_size): optimizer.zero_grad() data_time, batch_loss, batch_score = 0, 0, 0 iter_timer.tic() # set random seed for every iteration for trackability _set_seed(config, curr_iter) for sub_iter in range(config.iter_size): # Get training data data_timer.tic() coords, input, target = data_iter.next() # For some networks, making the network invariant to even, odd coords is important coords[:, :3] += (torch.rand(3) * 100).type_as(coords) # Preprocess input color = input[:, :3].int() if config.normalize_color: input[:, :3] = input[:, :3] / 255. 
- 0.5 sinput = SparseTensor(input, coords).to(device) data_time += data_timer.toc(False) # Feed forward fw_timer.tic() inputs = (sinput, ) if config.wrapper_type == 'None' else ( sinput, coords, color) # model.initialize_coords(*init_args) soutput = model(*inputs) # The output of the network is not sorted target = target.long().to(device) loss = criterion(soutput.F, target.long()) # Compute and accumulate gradient loss /= config.iter_size pred = get_prediction(data_loader.dataset, soutput.F, target) score = precision_at_one(pred, target) fw_timer.toc(False) bw_timer.tic() # bp the loss loss.backward() bw_timer.toc(False) # gather information logging_output = { 'loss': loss.item(), 'score': score / config.iter_size } ddp_timer.tic() if distributed: logging_output = all_gather_list(logging_output) logging_output = { w: np.mean([a[w] for a in logging_output]) for w in logging_output[0] } batch_loss += logging_output['loss'] batch_score += logging_output['score'] ddp_timer.toc(False) # Update number of steps optimizer.step() scheduler.step() data_time_avg.update(data_time) iter_time_avg.update(iter_timer.toc(False)) fw_time_avg.update(fw_timer.diff) bw_time_avg.update(bw_timer.diff) ddp_time_avg.update(ddp_timer.diff) losses.update(batch_loss, target.size(0)) scores.update(batch_score, target.size(0)) if curr_iter >= config.max_iter: is_training = False break if curr_iter % config.stat_freq == 0 or curr_iter == 1: lrs = ', '.join( ['{:.3e}'.format(x) for x in scheduler.get_lr()]) debug_str = "===> Epoch[{}]({}/{}): Loss {:.4f}\tLR: {}\t".format( epoch, curr_iter, len(data_loader) // config.iter_size, losses.avg, lrs) debug_str += "Score {:.3f}\tData time: {:.4f}, Forward time: {:.4f}, Backward time: {:.4f}, DDP time: {:.4f}, Total iter time: {:.4f}".format( scores.avg, data_time_avg.avg, fw_time_avg.avg, bw_time_avg.avg, ddp_time_avg.avg, iter_time_avg.avg) logging.info(debug_str) # Reset timers data_time_avg.reset() iter_time_avg.reset() # Write logs writer.add_scalar('training/loss', losses.avg, curr_iter) writer.add_scalar('training/precision_at_1', scores.avg, curr_iter) writer.add_scalar('training/learning_rate', scheduler.get_lr()[0], curr_iter) losses.reset() scores.reset() # Save current status, save before val to prevent occational mem overflow if curr_iter % config.save_freq == 0: checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou, best_val_iter) # Validation if curr_iter % config.val_freq == 0: val_miou = validate(model, val_data_loader, writer, curr_iter, config, transform_data_fn) if val_miou > best_val_miou: best_val_miou = val_miou best_val_iter = curr_iter checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou, best_val_iter, "best_val") logging.info("Current best mIoU: {:.3f} at iter {}".format( best_val_miou, best_val_iter)) # Recover back model.train() if curr_iter % config.empty_cache_freq == 0: # Clear cache torch.cuda.empty_cache() # End of iteration curr_iter += 1 epoch += 1 # Explicit memory cleanup if hasattr(data_iter, 'cleanup'): data_iter.cleanup() # Save the final model checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou, best_val_iter) val_miou = validate(model, val_data_loader, writer, curr_iter, config, transform_data_fn) if val_miou > best_val_miou: best_val_miou = val_miou best_val_iter = curr_iter checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou, best_val_iter, "best_val") logging.info("Current best mIoU: {:.3f} at iter {}".format( best_val_miou, best_val_iter))
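The `losses`, `scores`, `data_time_avg` and related objects in the training loop above are `AverageMeter` instances; the class is not reproduced in this section. A minimal sketch consistent with the `update(val, n)` / `.avg` / `reset()` usage:

class AverageMeter(object):
  """Sketch of the usual running-average helper: update(val, n), .avg, reset()."""

  def __init__(self):
    self.reset()

  def reset(self):
    self.val = 0.
    self.sum = 0.
    self.count = 0
    self.avg = 0.

  def update(self, val, n=1):
    self.val = val
    self.sum += val * n
    self.count += n
    self.avg = self.sum / self.count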
class TensorboardWriter(): def __init__(self, log_dir, logger, enabled): self.writer = None self.selected_module = "" if enabled: log_dir = str(log_dir) # Retrieve vizualization writer. succeeded = False for module in ["torch.utils.tensorboard", "tensorboardX"]: try: self.writer = importlib.import_module( module).SummaryWriter(log_dir) succeeded = True break except ImportError: succeeded = False self.selected_module = module if not succeeded: message = "Warning: visualization (Tensorboard) is configured to use, but currently not installed on " \ "this machine. Please install either TensorboardX with 'pip install tensorboardx', upgrade " \ "PyTorch to version >= 1.1 for using 'torch.utils.tensorboard' or turn off the option in " \ "the 'config.json' file." logger.warning(message) self.step = 0 self.mode = '' self.tb_writer_ftns = { 'add_scalar', 'add_scalars', 'add_image', 'add_images', 'add_audio', 'add_text', 'add_histogram', 'add_pr_curve', 'add_embedding' } self.tag_mode_exceptions = {'add_histogram', 'add_embedding'} self.timer = Timer() def set_step(self, step, mode='train'): self.mode = mode self.step = step if step == 0: self.timer.reset() else: duration = self.timer.check() self.add_scalar('steps_per_sec', 1 / duration) def __getattr__(self, name): """ If visualization is configured to use: return add_data() methods of tensorboard with additional information (step, tag) added. Otherwise: return a blank function handle that does nothing """ if name in self.tb_writer_ftns: add_data = getattr(self.writer, name, None) def wrapper(tag, data, *args, **kwargs): if add_data is not None: # add mode(train/valid) tag if name not in self.tag_mode_exceptions: tag = '{}/{}'.format(tag, self.mode) add_data(tag, data, self.step, *args, **kwargs) return wrapper else: # default action for returning methods defined in this class, set_step() for instance. try: attr = object.__getattr__(name) except AttributeError: raise AttributeError( "type object '{}' has no attribute '{}'".format( self.selected_module, name)) return attr
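A short usage sketch for the `TensorboardWriter` wrapper above; the log directory, logger name and loss values are placeholders, and Tensorboard support (torch.utils.tensorboard or tensorboardX) is assumed to be installed:

import logging
from pathlib import Path

logger = logging.getLogger(__name__)
writer = TensorboardWriter(Path('saved/log/example_run'), logger, enabled=True)

for step, loss in enumerate([0.9, 0.7, 0.55]):
  writer.set_step(step, mode='train')  # also records steps_per_sec via the internal Timer
  writer.add_scalar('loss', loss)      # logged under the tag 'loss/train'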
def test(model, data_loader, config, transform_data_fn=None, has_gt=True, validation=None, epoch=None): device = get_torch_device(config.is_cuda) dataset = data_loader.dataset num_labels = dataset.NUM_LABELS global_timer, data_timer, iter_timer = Timer(), Timer(), Timer() criterion = nn.CrossEntropyLoss(ignore_index=config.ignore_label) alpha, gamma, eps = 1, 2, 1e-6 # Focal Loss parameters losses, scores, ious = AverageMeter(), AverageMeter(), 0 aps = np.zeros((0, num_labels)) hist = np.zeros((num_labels, num_labels)) if not config.is_train: checkpoint_fn = config.resume + '/weights.pth' if osp.isfile(checkpoint_fn): logging.info("=> loading checkpoint '{}'".format(checkpoint_fn)) state = torch.load(checkpoint_fn) model.load_state_dict(state['state_dict']) logging.info("=> loaded checkpoint '{}' (epoch {})".format( checkpoint_fn, state['epoch'])) else: raise ValueError( "=> no checkpoint found at '{}'".format(checkpoint_fn)) if validation: logging.info('===> Start validating') else: logging.info('===> Start testing') global_timer.tic() data_iter = data_loader.__iter__() max_iter = len(data_loader) max_iter_unique = max_iter all_preds, all_labels, batch_losses, batch_loss = [], [], {}, 0 # Fix batch normalization running mean and std model.eval() # Clear cache (when run in val mode, cleanup training cache) torch.cuda.empty_cache() if config.save_prediction or config.test_original_pointcloud: if config.save_prediction: save_pred_dir = config.save_pred_dir os.makedirs(save_pred_dir, exist_ok=True) else: save_pred_dir = tempfile.mkdtemp() if os.listdir(save_pred_dir): raise ValueError(f'Directory {save_pred_dir} not empty. ' 'Please remove the existing prediction.') with torch.no_grad(): for iteration in range(max_iter): data_timer.tic() if config.return_transformation: coords, input, target, transformation = data_iter.next() else: coords, input, target = data_iter.next() transformation = None data_time = data_timer.toc(False) # Preprocess input iter_timer.tic() if config.wrapper_type != 'None': color = input[:, :3].int() if config.normalize_color: input[:, :3] = input[:, :3] / 255. 
- 0.5 sinput = SparseTensor(input, coords).to(device) # Feed forward inputs = (sinput, ) if config.wrapper_type == 'None' else (sinput, coords, color) soutput = model(*inputs) output = soutput.F pred = get_prediction(dataset, output, target).int() iter_time = iter_timer.toc(False) all_preds.append(pred.cpu().detach().numpy()) all_labels.append(target.cpu().detach().numpy()) if config.save_prediction or config.test_original_pointcloud: save_predictions(coords, pred, transformation, dataset, config, iteration, save_pred_dir) if has_gt: if config.evaluate_original_pointcloud: raise NotImplementedError('pointcloud') output, pred, target = permute_pointcloud( coords, pointcloud, transformation, dataset.label_map, output, pred) target_np = target.numpy() num_sample = target_np.shape[0] target = target.to(device) """# focal loss input_soft = nn.functional.softmax(output, dim=1) + eps focal_weight = torch.pow(-input_soft + 1., gamma) loss = (-alpha * focal_weight * torch.log(input_soft)).mean()""" loss = criterion(output, target.long()) batch_loss += loss losses.update(float(loss), num_sample) scores.update(precision_at_one(pred, target), num_sample) hist += fast_hist(pred.cpu().numpy().flatten(), target_np.flatten(), num_labels) ious = per_class_iu(hist) * 100 prob = torch.nn.functional.softmax(output, dim=1) ap = average_precision(prob.cpu().detach().numpy(), target_np) aps = np.vstack((aps, ap)) # Due to heavy bias in class, there exists class with no test label at all with warnings.catch_warnings(): warnings.simplefilter("ignore", category=RuntimeWarning) ap_class = np.nanmean(aps, 0) * 100. if iteration % config.test_stat_freq == 0 and iteration > 0: preds = np.concatenate(all_preds) targets = np.concatenate(all_labels) to_ignore = [ i for i in range(len(targets)) if targets[i] == 255 ] preds_trunc = [ preds[i] for i in range(len(preds)) if i not in to_ignore ] targets_trunc = [ targets[i] for i in range(len(targets)) if i not in to_ignore ] cm = confusion_matrix(targets_trunc, preds_trunc, normalize='true') np.savetxt(config.log_dir + '/cm_epoch_{0}.txt'.format(epoch), cm) reordered_ious = dataset.reorder_result(ious) reordered_ap_class = dataset.reorder_result(ap_class) class_names = dataset.get_classnames() print_info(iteration, max_iter_unique, data_time, iter_time, has_gt, losses, scores, reordered_ious, hist, reordered_ap_class, class_names=class_names) if iteration % config.empty_cache_freq == 0: # Clear cache torch.cuda.empty_cache() batch_losses[epoch] = batch_loss global_time = global_timer.toc(False) reordered_ious = dataset.reorder_result(ious) reordered_ap_class = dataset.reorder_result(ap_class) class_names = dataset.get_classnames() print_info(iteration, max_iter_unique, data_time, iter_time, has_gt, losses, scores, reordered_ious, hist, reordered_ap_class, class_names=class_names) if not config.is_train: preds = np.concatenate(all_preds) targets = np.concatenate(all_labels) to_ignore = [i for i in range(len(targets)) if targets[i] == 255] preds_trunc = [ preds[i] for i in range(len(preds)) if i not in to_ignore ] targets_trunc = [ targets[i] for i in range(len(targets)) if i not in to_ignore ] cm = confusion_matrix(targets_trunc, preds_trunc, normalize='true') np.savetxt(config.log_dir + '/cm.txt', cm) if config.test_original_pointcloud: logging.info('===> Start testing on original pointcloud space.') dataset.test_pointcloud(save_pred_dir) logging.info("Finished test. 
Elapsed time: {:.4f}".format(global_time)) if validation: loss_file_name = "/val_loss.txt" with open(config.log_dir + loss_file_name, 'a') as val_loss_file: for key in batch_losses: val_loss_file.writelines('{0}, {1}\n'.format( batch_losses[key], key)) val_loss_file.close() return losses.avg, scores.avg, np.nanmean(ap_class), np.nanmean( per_class_iu(hist)) * 100, batch_losses else: return losses.avg, scores.avg, np.nanmean(ap_class), np.nanmean( per_class_iu(hist)) * 100
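The IoU numbers reported by the test routine above come from `fast_hist` and `per_class_iu`, which are not shown here. A common formulation of both, given as a sketch rather than the project's exact code:

import numpy as np


def fast_hist(pred, label, num_labels):
  """Sketch: accumulate a num_labels x num_labels confusion histogram."""
  valid = (label >= 0) & (label < num_labels)
  return np.bincount(
      num_labels * label[valid].astype(int) + pred[valid],
      minlength=num_labels**2).reshape(num_labels, num_labels)


def per_class_iu(hist):
  """Sketch: per-class IoU = TP / (TP + FP + FN); empty classes come out as NaN."""
  with np.errstate(divide='ignore', invalid='ignore'):
    return np.diag(hist) / (hist.sum(1) + hist.sum(0) - np.diag(hist))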
def train(model, data_loader, val_data_loader, config, transform_data_fn=None): device = get_torch_device(config.is_cuda) # Set up the train flag for batch normalization model.train() # Configuration writer = SummaryWriter(log_dir=config.log_dir) data_timer, iter_timer = Timer(), Timer() data_time_avg, iter_time_avg = AverageMeter(), AverageMeter() losses, scores = AverageMeter(), AverageMeter() optimizer = initialize_optimizer(model.parameters(), config) scheduler = initialize_scheduler(optimizer, config) criterion = nn.CrossEntropyLoss(ignore_index=config.ignore_label) writer = SummaryWriter(log_dir=config.log_dir) # Train the network logging.info('===> Start training') best_val_miou, best_val_iter, curr_iter, epoch, is_training = 0, 0, 1, 1, True if config.resume: checkpoint_fn = config.resume + '/weights.pth' if osp.isfile(checkpoint_fn): logging.info("=> loading checkpoint '{}'".format(checkpoint_fn)) state = torch.load(checkpoint_fn) curr_iter = state['iteration'] + 1 epoch = state['epoch'] model.load_state_dict(state['state_dict']) if config.resume_optimizer: scheduler = initialize_scheduler(optimizer, config, last_step=curr_iter) optimizer.load_state_dict(state['optimizer']) if 'best_val' in state: best_val_miou = state['best_val'] best_val_iter = state['best_val_iter'] logging.info("=> loaded checkpoint '{}' (epoch {})".format( checkpoint_fn, state['epoch'])) else: raise ValueError( "=> no checkpoint found at '{}'".format(checkpoint_fn)) data_iter = data_loader.__iter__() while is_training: for iteration in range(len(data_loader) // config.iter_size): optimizer.zero_grad() data_time, batch_loss = 0, 0 iter_timer.tic() for sub_iter in range(config.iter_size): # Get training data data_timer.tic() if config.return_transformation: coords, input, target, pointcloud, transformation = data_iter.next( ) else: coords, input, target = data_iter.next() # For some networks, making the network invariant to even, odd coords is important coords[:, 1:] += (torch.rand(3) * 100).type_as(coords) # Preprocess input if config.normalize_color: input[:, :3] = input[:, :3] / 255. 
- 0.5 sinput = SparseTensor(input, coords).to(device) data_time += data_timer.toc(False) # model.initialize_coords(*init_args) soutput = model(sinput) # The output of the network is not sorted target = target.long().to(device) loss = criterion(soutput.F, target.long()) # Compute and accumulate gradient loss /= config.iter_size batch_loss += loss.item() loss.backward() # Update number of steps optimizer.step() scheduler.step() data_time_avg.update(data_time) iter_time_avg.update(iter_timer.toc(False)) pred = get_prediction(data_loader.dataset, soutput.F, target) score = precision_at_one(pred, target) losses.update(batch_loss, target.size(0)) scores.update(score, target.size(0)) if curr_iter >= config.max_iter: is_training = False break if curr_iter % config.stat_freq == 0 or curr_iter == 1: lrs = ', '.join( ['{:.3e}'.format(x) for x in scheduler.get_lr()]) debug_str = "===> Epoch[{}]({}/{}): Loss {:.4f}\tLR: {}\t".format( epoch, curr_iter, len(data_loader) // config.iter_size, losses.avg, lrs) debug_str += "Score {:.3f}\tData time: {:.4f}, Iter time: {:.4f}".format( scores.avg, data_time_avg.avg, iter_time_avg.avg) logging.info(debug_str) # Reset timers data_time_avg.reset() iter_time_avg.reset() # Write logs writer.add_scalar('training/loss', losses.avg, curr_iter) writer.add_scalar('training/precision_at_1', scores.avg, curr_iter) writer.add_scalar('training/learning_rate', scheduler.get_lr()[0], curr_iter) losses.reset() scores.reset() # Save current status, save before val to prevent occational mem overflow if curr_iter % config.save_freq == 0: checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou, best_val_iter) # Validation if curr_iter % config.val_freq == 0: val_miou = validate(model, val_data_loader, writer, curr_iter, config, transform_data_fn) if val_miou > best_val_miou: best_val_miou = val_miou best_val_iter = curr_iter checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou, best_val_iter, "best_val") logging.info("Current best mIoU: {:.3f} at iter {}".format( best_val_miou, best_val_iter)) # Recover back model.train() # End of iteration curr_iter += 1 epoch += 1 # Explicit memory cleanup if hasattr(data_iter, 'cleanup'): data_iter.cleanup() # Save the final model checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou, best_val_iter) val_miou = validate(model, val_data_loader, writer, curr_iter, config, transform_data_fn) if val_miou > best_val_miou: best_val_miou = val_miou best_val_iter = curr_iter checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou, best_val_iter, "best_val") logging.info("Current best mIoU: {:.3f} at iter {}".format( best_val_miou, best_val_iter))
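The `checkpoint(...)` helper called throughout the training loop above persists the state that the resume branch at the top of `train()` reads back from `weights.pth`. The following sketch is consistent with the keys used there ('iteration', 'epoch', 'state_dict', 'optimizer', 'best_val', 'best_val_iter'); the filename handling for the "best_val" postfix is an assumption:

import os
import torch


def checkpoint(model, optimizer, epoch, iteration, config,
               best_val_miou, best_val_iter, postfix=None):
  """Sketch of a checkpoint writer matching the resume logic in train()."""
  os.makedirs(config.log_dir, exist_ok=True)
  filename = 'weights.pth' if postfix is None else 'checkpoint_{}.pth'.format(postfix)
  state = {
      'iteration': iteration,
      'epoch': epoch,
      'state_dict': model.state_dict(),
      'optimizer': optimizer.state_dict(),
      'best_val': best_val_miou,
      'best_val_iter': best_val_iter,
  }
  torch.save(state, os.path.join(config.log_dir, filename))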
def test(model, data_loader, config, transform_data_fn=None, has_gt=True): device = get_torch_device(config.is_cuda) dataset = data_loader.dataset num_labels = dataset.NUM_LABELS global_timer, data_timer, iter_timer = Timer(), Timer(), Timer() criterion = nn.CrossEntropyLoss(ignore_index=config.ignore_label) losses, scores, ious = AverageMeter(), AverageMeter(), 0 aps = np.zeros((0, num_labels)) hist = np.zeros((num_labels, num_labels)) logging.info('===> Start testing') global_timer.tic() data_iter = data_loader.__iter__() max_iter = len(data_loader) max_iter_unique = max_iter # Fix batch normalization running mean and std model.eval() # Clear cache (when run in val mode, cleanup training cache) torch.cuda.empty_cache() if config.save_prediction or config.test_original_pointcloud: if config.save_prediction: save_pred_dir = config.save_pred_dir os.makedirs(save_pred_dir, exist_ok=True) else: save_pred_dir = tempfile.mkdtemp() if os.listdir(save_pred_dir): raise ValueError(f'Directory {save_pred_dir} not empty. ' 'Please remove the existing prediction.') with torch.no_grad(): for iteration in range(max_iter): data_timer.tic() if config.return_transformation: coords, input, target, transformation = data_iter.next() else: coords, input, target = data_iter.next() transformation = None data_time = data_timer.toc(False) # Preprocess input iter_timer.tic() if config.wrapper_type != 'None': color = input[:, :3].int() if config.normalize_color: input[:, :3] = input[:, :3] / 255. - 0.5 sinput = SparseTensor(input, coords).to(device) # Feed forward inputs = (sinput, ) if config.wrapper_type == 'None' else (sinput, coords, color) soutput = model(*inputs) output = soutput.F pred = get_prediction(dataset, output, target).int() iter_time = iter_timer.toc(False) if config.save_prediction or config.test_original_pointcloud: save_predictions(coords, pred, transformation, dataset, config, iteration, save_pred_dir) if has_gt: if config.evaluate_original_pointcloud: raise NotImplementedError('pointcloud') output, pred, target = permute_pointcloud( coords, pointcloud, transformation, dataset.label_map, output, pred) target_np = target.numpy() num_sample = target_np.shape[0] target = target.to(device) cross_ent = criterion(output, target.long()) losses.update(float(cross_ent), num_sample) scores.update(precision_at_one(pred, target), num_sample) hist += fast_hist(pred.cpu().numpy().flatten(), target_np.flatten(), num_labels) ious = per_class_iu(hist) * 100 prob = torch.nn.functional.softmax(output, dim=1) ap = average_precision(prob.cpu().detach().numpy(), target_np) aps = np.vstack((aps, ap)) # Due to heavy bias in class, there exists class with no test label at all with warnings.catch_warnings(): warnings.simplefilter("ignore", category=RuntimeWarning) ap_class = np.nanmean(aps, 0) * 100. 
if iteration % config.test_stat_freq == 0 and iteration > 0: reordered_ious = dataset.reorder_result(ious) reordered_ap_class = dataset.reorder_result(ap_class) class_names = dataset.get_classnames() print_info(iteration, max_iter_unique, data_time, iter_time, has_gt, losses, scores, reordered_ious, hist, reordered_ap_class, class_names=class_names) if iteration % config.empty_cache_freq == 0: # Clear cache torch.cuda.empty_cache() global_time = global_timer.toc(False) reordered_ious = dataset.reorder_result(ious) reordered_ap_class = dataset.reorder_result(ap_class) class_names = dataset.get_classnames() print_info(iteration, max_iter_unique, data_time, iter_time, has_gt, losses, scores, reordered_ious, hist, reordered_ap_class, class_names=class_names) if config.test_original_pointcloud: logging.info('===> Start testing on original pointcloud space.') dataset.test_pointcloud(save_pred_dir) logging.info("Finished test. Elapsed time: {:.4f}".format(global_time)) return losses.avg, scores.avg, np.nanmean(ap_class), np.nanmean( per_class_iu(hist)) * 100
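`precision_at_one`, used for the "Score" column in both test routines, is the point-wise top-1 accuracy. A sketch, assuming an ignore label of 255 as in the confusion-matrix filtering above:

import torch


def precision_at_one(pred, target, ignore_label=255):
  """Sketch: percentage of points whose top-1 prediction matches the target."""
  pred = pred.reshape(-1)
  target = target.reshape(-1)
  correct = pred.eq(target)
  correct = correct[target != ignore_label]
  if correct.numel() == 0:
    return float('nan')
  return 100.0 * correct.float().mean().item()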
def train(self, train_queue, val_queue=None): ''' Given data queues, train the network ''' # Parameter directory save_dir = os.path.join(cfg.DIR.OUT_PATH) if not os.path.exists(save_dir): os.makedirs(save_dir) # Timer for the training op and parallel data loading op. train_timer = Timer() data_timer = Timer() training_losses = [] start_iter = 0 # Resume training if cfg.TRAIN.RESUME_TRAIN: self.net.load(cfg.CONST.WEIGHTS) start_iter = cfg.TRAIN.INITIAL_ITERATION # Setup learning rates lr = cfg.TRAIN.DEFAULT_LEARNING_RATE lr_steps = [int(k) for k in cfg.TRAIN.LEARNING_RATES.keys()] print('Set the learning rate to %f.' % lr) self.set_lr(lr) # Main training loop for train_ind in range(start_iter, cfg.TRAIN.NUM_ITERATION + 1): data_timer.tic() batch_img, batch_voxel = train_queue.get() data_timer.toc() if self.net.is_x_tensor4: batch_img = batch_img[0] # Apply one gradient step train_timer.tic() loss = self.train_loss(batch_img, batch_voxel) train_timer.toc() training_losses.append(loss) # Decrease learning rate at certain points if train_ind in lr_steps: # edict only takes string for key. Hacky way self.set_lr(np.float(cfg.TRAIN.LEARNING_RATES[str(train_ind)])) print('Learing rate decreased to %f: ' % self.lr.get_value()) # Debugging modules # # Print status, run validation, check divergence, and save model. if train_ind % cfg.TRAIN.PRINT_FREQ == 0: # Print the current loss print('%s Iter: %d Loss: %f' % (datetime.now(), train_ind, loss)) if train_ind % cfg.TRAIN.VALIDATION_FREQ == 0 and val_queue is not None: # Print test loss and params to check convergence every N iterations val_losses = [] for i in range(cfg.TRAIN.NUM_VALIDATION_ITERATIONS): batch_img, batch_voxel = val_queue.get() _, val_loss, _ = self.test_output(batch_img, batch_voxel) val_losses.append(val_loss) print('%s Test loss: %f' % (datetime.now(), np.mean(val_losses))) if train_ind % cfg.TRAIN.NAN_CHECK_FREQ == 0: # Check that the network parameters are all valid max_param = max_or_nan(self.net.params) if np.isnan(max_param): print('NAN detected') break if train_ind % cfg.TRAIN.SAVE_FREQ == 0 and not train_ind == 0: self.save(training_losses, save_dir, train_ind) if loss > cfg.TRAIN.LOSS_LIMIT: print("Cost exceeds the threshold. Stop training") break
class Solver(object): """Solver for generic networks. """ def __init__(self, net, graph, is_training): self.net = net self.graph = graph self.is_training = is_training self.num_epochs = cfg.TRAIN.NUM_EPOCHS self.train_timer = Timer() self.data_timer = Timer() self.global_step = slim.get_or_create_global_step() # Build basic ops and tensors if self.is_training: self.net.build_train_ops(self.global_step) if isinstance(net, Classifier): self.saver = tf.train.Saver(var_list=net.vars_to_restore, name='saver') else: self.saver = tf.train.Saver(max_to_keep=None, name='saver_all_var') # Save all vars self.init_ops = self.build_init_ops() self.val_loss_ph = tf.placeholder(tf.float32, shape=(), name='val_loss_ph') self.net.build_summary_ops(self.graph) self.val_loss_summary = tf.summary.scalar(name='val_loss', tensor=self.val_loss_ph) print('saver variables:') print_tensor_shapes(net.vars_to_restore, prefix='-->') def build_init_ops(self): """Builds the init ops. Returns: init_op: Initialization op. ready_op: Initialization op. local_init_op: Initialization op. """ with tf.name_scope('init_ops'): init_op = tf.global_variables_initializer() ready_op = tf.report_uninitialized_variables() local_init_op = tf.group(tf.local_variables_initializer(), tf.tables_initializer()) return init_op, ready_op, local_init_op def restore_checkpoint(self, sess): """Restores the network to a previously saved checkpoint if a path is provided from the config. Args: sess: Current session. """ if cfg.DIR.CKPT_PATH is not None: tf.logging.info('Restoring checkpoint.') self.saver.restore(sess, cfg.DIR.CKPT_PATH) else: tf.logging.info('Using network with random weights.') def train_step(self, sess, train_queue, step): """Executes a train step, including saving the summaries if appropriate. Args: sess: Current session. train_queue: Data queue containing train set minibatches. step: The current training iteration (0-based indexing). Returns: print_dict: Dictionary of items such as losses (for just one minibatch) to print. """ print_dict = self.net.train_step(sess, train_queue, step, data_timer=self.data_timer) return print_dict def val_step(self, sess, val_queue): """Executes a validation step, which simply computes the loss. Args: sess: Current session. val_queue: Data queue containing validation set minibatches. Returns: val_loss: Loss for a single minibatch of validation data. """ raise NotImplementedError('Must be implemented by a subclass.') def validate(self, sess, val_queue, step, num_val_iter): raise NotImplementedError('Must be implemented by a subclass.') def train(self, train_iters_per_epoch, train_queue, val_iters_per_epoch=None, val_queue=None): """Train the network, computing the validation loss if val_iters_per_epoch and val_queue are provided. Args: train_iters_per_epoch: Number of iterations in a single epoch of train data, as computed by the data process. train_queue: Data queue containing minibatches of train data. val_iters_per_epoch: Optional input describing the number of iterations in a single epoch of validation data, as computed by the data process. val_queue: Optional input representing the data queue containing minibatches of validation data. 
""" if (val_iters_per_epoch is None and val_queue is not None) or \ (val_iters_per_epoch is not None and val_queue is None): raise ValueError('Need to input both val size and val queue.') if val_iters_per_epoch is not None and val_queue is not None: run_validation = True else: run_validation = False print('-------------- BEGIN TRAINING --------------') num_train_iter = get_num_iterations(train_iters_per_epoch, num_epochs=cfg.TRAIN.NUM_EPOCHS, disp=True) num_val_iter = 20000 // cfg.CONST.BATCH_SIZE # Evaluate on roughly 20000 samples if val_iters_per_epoch is not None: num_val_iter = min(num_val_iter, val_iters_per_epoch) with tf.Session() as sess: sess.run(self.init_ops) self.restore_checkpoint(sess) # Train loop for step in range(num_train_iter): # For randomized model # self.save(sess, step) # break self.train_timer.tic() print_dict = self.train_step(sess, train_queue, step) self.train_timer.toc() if (step + 1) % cfg.CONST.PRINT_FREQ == 0: print_dict['queue size'] = (str(train_queue.qsize()) + '/' + str(cfg.CONST.QUEUE_CAPACITY)) print_dict[ 'data fetch (sec/step)'] = '%.2f' % self.data_timer.average_time print_dict[ 'train step (sec/step)'] = '%.2f' % self.train_timer.average_time print_train_step_data(print_dict, step) # Reset timers self.data_timer.reset() self.train_timer.reset() if (run_validation is True) and ( (step + 1) % cfg.TRAIN.VALIDATION_FREQ == 0): validation_val = self.validate(sess, val_queue, step, num_val_iter) if validation_val == -1: # Training termination flag tf.logging.info( 'Terminating train loop due to decreasing validation performance.' ) break else: val_summary = sess.run( self.val_loss_summary, feed_dict={self.val_loss_ph: validation_val}) self.net.summary_writer.add_summary( val_summary, (step + 1)) if (step + 1) % cfg.TRAIN.CKPT_FREQ == 0: self.save(sess, step) # save model after training self.save(sess, step) def forward_pass_batches(self, sess, minibatch_generator): """Forward pass a series of minibatches from the minibatch generator. """ minibatch_list = [] outputs_list = [] for step, minibatch in enumerate(minibatch_generator): np.random.seed(1234) try: outputs = self.net.forward_pass(sess, minibatch) except KeyError: outputs = self.net.forward_pass(sess, minibatch, full_feed_dict=True) # Reduce size of minibatch so we can pass through entire val set if isinstance(self.net, LBA): minibatch_save = { 'raw_embedding_batch': minibatch['raw_embedding_batch'], 'caption_label_batch': minibatch['caption_label_batch'], 'category_list': minibatch['category_list'], 'model_list': minibatch['model_list'], } minibatch = minibatch_save if isinstance(self.net, Classifier): minibatch_save = { 'class_label_batch': minibatch['class_label_batch'], 'model_id_list': minibatch['model_id_list'], } minibatch = minibatch_save minibatch_list.append(minibatch) outputs_list.append(outputs) if (step + 1) % 100 == 0: tf.logging.info('%s Step: %d' % (str(datetime.now()), step + 1)) return minibatch_list, outputs_list def val_phase_minibatch_generator(self, val_queue, num_val_iter): """Return a minibatch generator for the test phase. """ for step in range(num_val_iter): minibatch = val_queue.get() minibatch['test_queue'] = True yield minibatch def evaluate(self, minibatch_list, outputs_list): """Do some evaluation of the outputs. """ pass def test(self, test_process, test_queue, num_minibatches=None, save_outputs=False): """Compute (and optionally save) the outputs for the test set. This function only computes the outputs for num_minibatches minibatches. 
Args: test_process: Data process for the test data. test_queue: Queue containing minibatches of test data. num_minibatches: Number of minibatches to compute the outputs for. save_outputs: Boolean flag for whether or not to save the outputs. """ with tf.Session() as sess: if cfg.DIR.CKPT_PATH is None: raise ValueError('Please provide a checkpoint.') sess.run(self.init_ops) else: self.restore_checkpoint(sess) def test_phase_minibatch_generator(): for step, minibatch in enumerate( get_while_running(test_process, test_queue)): if (num_minibatches is not None) and (step == num_minibatches): break yield minibatch minibatch_generator = test_phase_minibatch_generator() minibatch_list, outputs_list = self.forward_pass_batches( sess, minibatch_generator) self.evaluate(minibatch_list, outputs_list) if save_outputs: self.save_outputs(minibatch_list, outputs_list) def save(self, sess, step): """Save a checkpoint. """ ckpt_path = os.path.join(cfg.DIR.LOG_PATH, 'model.ckpt') tf.logging.info('Saving checkpoint (step %d).', (step + 1)) self.saver.save(sess, ckpt_path, global_step=(step + 1)) def save_outputs(self, minibatch_list, outputs_list, filename=None): """Save the outputs (from the self.test). """ raise NotImplementedError('Must be implemented by a subclass.')
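The test-phase generator in the Solver above relies on `get_while_running`, which drains a data queue while the producer process is still alive. A hedged sketch of that pattern, assuming a multiprocessing-style process and queue; the timeout value and sentinel handling are assumptions:

import queue


def get_while_running(data_process, data_queue, sentinel=None):
  """Sketch: yield minibatches until the producer exits and the queue is empty."""
  while True:
    try:
      minibatch = data_queue.get(timeout=0.5)
    except queue.Empty:
      if not data_process.is_alive():
        break
      continue
    if minibatch is sentinel:
      break
    yield minibatch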
class ModelRunner(object): """ Train, evaluate, save and restore model. """ def __init__(self, config): self._config = config self._train_config = config['train'] self._model_config = config['model'] self._data_config = config['data'] # the folders for model and tensor board self._training_folder = None self._model_folder = None self._tfb_folder = None # build model self._model = BaseModel.generate_model_from_config(self._model_config) self._model_saver = tf.train.Saver(max_to_keep=0) # other setting self._timer = Timer('m') @property def model(self): return self._model def train_model(self, sess, train_data_provider, valid_data_provider, test_data_provider=None): epoch_num, max_epoch, lr_scheduler, continue_training = self._load_train_status( ) # get logger for this training logger = get_logger(os.path.join(self._training_folder, 'training.log')) # training from scratch or continue training if continue_training: # trained model existed, then restore it. model_path = self.restore_model(sess) epoch_num += 1 logger.info(f'Restore model from {model_path}') else: # initialize variables sess.run([tf.global_variables_initializer()]) logger.info( f'Training starts on dataset {self._data_config["data_name"]}') logger.info( f'----------Trainable parameter count: {get_num_trainable_params()} of model {self._model_folder}' ) best_valid_loss = float('inf') lr = lr_scheduler.get_lr() while lr > 0 and epoch_num <= max_epoch: # Train loss, _, _, elapse = self._run_epoch(sess, train_data_provider, lr, is_train=True) logger.info( f'Epoch {epoch_num}: train loss - {loss}, learning rate - {lr}.' f' Cost time: {elapse:.3f}{self._timer.unit()}') # Valid loss, _, _, _ = self._run_epoch(sess, valid_data_provider, lr, is_train=False) # Update after train and valid # update lr lr = lr_scheduler.update_lr(loss=loss, epoch_num=epoch_num) # update train_config self._update_train_config(lr, epoch_num) if loss < best_valid_loss: best_valid_loss = loss # save best model self._save_model_with_config(sess) # Test loss, preds, labels, elapse = self._run_epoch( sess, test_data_provider, lr, is_train=False) metrics = test_data_provider.get_metrics(preds, labels) str_metrics = str(metrics) logger.info(f'---Test Loss: {loss}, metrics: {str_metrics}. ' f'Cost time: {elapse:.3f}{self._timer.unit()}') epoch_num += 1 logger.info('Training Finished!') def evaluate_model(self, sess, data_provider): self.restore_model(sess) loss, preds, labels, _ = self._run_epoch(sess, data_provider, lr=0, is_train=False) metrics = data_provider.get_metrics(preds, labels) return preds, labels, metrics def restore_model(self, sess): train_config = self._train_config model_path = train_config['model_path'] self._model_saver.restore(sess, model_path) return model_path def _load_train_status(self): """ Load training status. Create base folders if the config presents a new training. 
:return: """ train_config = self._train_config # assign parameters epoch_num = train_config.get('epoch') max_epoch = train_config.get('max_epoch') # get lr scheduler lr_scheduler = LRScheduler.generate_scheduler_by_name( train_config.get('lr_scheduler'), **train_config) model_path = train_config.get('model_path') if model_path: # continue last training continue_training = True # read corresponding training path self._model_folder = os.path.dirname(model_path) self._training_folder = os.path.dirname(self._model_folder) self._tfb_folder = create_folder(self._training_folder, 'tfbs') else: # training from scratch continue_training = False # create model and tensorflow board folder time = datetime.datetime.now() timestamp = datetime.datetime.strftime(time, '%m%d%H%M%S') model_foldername = make_config_string( self._config['model']) + '_' + timestamp self._training_folder = create_folder(self._config['base_dir'], model_foldername) self._model_folder = create_folder(self._training_folder, 'models') self._tfb_folder = create_folder(self._training_folder, 'tfbs') return epoch_num, max_epoch, lr_scheduler, continue_training def _save_model_with_config(self, sess): train_config = self._train_config # update model path in train config train_config['model_path'] = os.path.join( self._model_folder, 'model-' + str(train_config['epoch'])) # save model self._model_saver.save(sess, train_config['model_path']) # save config to yaml file config_path = os.path.join( self._model_folder, 'config-' + str(train_config['epoch']) + '.yaml') with open(config_path, 'w') as f: yaml.dump(self._config, f) def _update_train_config(self, lr, epoch): train_config = self._train_config train_config['lr'] = lr train_config['epoch'] = epoch def _run_epoch(self, sess, data_provider, lr, is_train): """ :param sess: :param data_provider: :param lr: :param is_train: :return: [epoch_loss, epoch_pred, epoch_label, epoch_cost_time] """ self._timer.start() model = self._model if is_train: run_func = model.train else: run_func = model.predict loss_list = [] pred_list = [] real_list = [] for batch_data in data_provider.iterate_batch_data(): loss, pred, real = run_func(sess, batch_data, lr=lr) loss_list.append(loss) pred_list.append(pred) real_list.append(real) # shape -> [n_items, horizon, D] epoch_preds = concat_arrs_of_dict(pred_list) epoch_reals = concat_arrs_of_dict(real_list) epoch_avg_loss = np.mean(loss_list) # inverse scaling data epoch_preds = data_provider.epoch_inverse_scaling(epoch_preds) epoch_reals = data_provider.epoch_inverse_scaling(epoch_reals) return epoch_avg_loss, epoch_preds, epoch_reals, self._timer.end()
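`ModelRunner` above uses `Timer('m')` as a start/end stopwatch that reports elapsed time in the unit passed to the constructor ('m' for minutes). A sketch consistent with the `start()` / `end()` / `unit()` calls in `_run_epoch` and the log messages; the class name and unit table are illustrative:

import time


class UnitTimer(object):
  """Sketch of a stopwatch returning elapsed time in seconds, minutes or hours."""

  _DIVISORS = {'s': 1.0, 'm': 60.0, 'h': 3600.0}

  def __init__(self, unit='s'):
    self._unit = unit
    self._divisor = self._DIVISORS[unit]
    self._start = time.time()

  def start(self):
    self._start = time.time()

  def end(self):
    return (time.time() - self._start) / self._divisor

  def unit(self):
    return self._unit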
def train(model, data_loader, val_data_loader, config, transform_data_fn=None): all_losses = [] device = get_torch_device(config.is_cuda) # Set up the train flag for batch normalization model.train() # Configuration writer = SummaryWriter(log_dir=config.log_dir) data_timer, iter_timer = Timer(), Timer() data_time_avg, iter_time_avg = AverageMeter(), AverageMeter() losses, scores, batch_losses = AverageMeter(), AverageMeter(), {} optimizer = initialize_optimizer(model.parameters(), config) scheduler = initialize_scheduler(optimizer, config) criterion = nn.CrossEntropyLoss(ignore_index=config.ignore_label) alpha, gamma, eps = 1, 2, 1e-6 writer = SummaryWriter(log_dir=config.log_dir) # Train the network logging.info('===> Start training') best_val_miou, best_val_iter, curr_iter, epoch, is_training = 0, 0, 1, 1, True if config.resume: checkpoint_fn = config.resume + '/weights.pth' if osp.isfile(checkpoint_fn): logging.info("=> loading checkpoint '{}'".format(checkpoint_fn)) state = torch.load(checkpoint_fn) curr_iter = state['iteration'] + 1 epoch = state['epoch'] model.load_state_dict(state['state_dict']) if config.resume_optimizer: scheduler = initialize_scheduler(optimizer, config, last_step=curr_iter) optimizer.load_state_dict(state['optimizer']) if 'best_val' in state: best_val_miou = state['best_val'] best_val_iter = state['best_val_iter'] logging.info("=> loaded checkpoint '{}' (epoch {})".format( checkpoint_fn, state['epoch'])) else: raise ValueError( "=> no checkpoint found at '{}'".format(checkpoint_fn)) data_iter = data_loader.__iter__() while is_training: print( "********************************** epoch N° {0} ************************" .format(epoch)) for iteration in range(len(data_loader) // config.iter_size): print("####### Iteration N° {0}".format(iteration)) optimizer.zero_grad() data_time, batch_loss = 0, 0 iter_timer.tic() for sub_iter in range(config.iter_size): print("------------- Sub_iteration N° {0}".format(sub_iter)) # Get training data data_timer.tic() coords, input, target = data_iter.next() print("len of coords : {0}".format(len(coords))) # For some networks, making the network invariant to even, odd coords is important coords[:, :3] += (torch.rand(3) * 100).type_as(coords) # Preprocess input color = input[:, :3].int() if config.normalize_color: input[:, :3] = input[:, :3] / 255. 
- 0.5 sinput = SparseTensor(input, coords).to(device) data_time += data_timer.toc(False) # Feed forward inputs = (sinput, ) if config.wrapper_type == 'None' else ( sinput, coords, color) # model.initialize_coords(*init_args) soutput = model(*inputs) # The output of the network is not sorted target = target.long().to(device) print("count of classes : {0}".format( np.unique(target.cpu().numpy(), return_counts=True))) print("target : {0}\ntarget_len : {1}".format( target, len(target))) print("target [0]: {0}".format(target[0])) input_soft = nn.functional.softmax(soutput.F, dim=1) + eps print("input_soft[0] : {0}".format(input_soft[0])) focal_weight = torch.pow(-input_soft + 1., gamma) print("focal_weight : {0}\nweight[0] : {1}".format( focal_weight, focal_weight[0])) focal_loss = (-alpha * focal_weight * torch.log(input_soft)).mean() loss = criterion(soutput.F, target.long()) print("focal_loss :{0}\nloss : {1}".format(focal_loss, loss)) # Compute and accumulate gradient loss /= config.iter_size #batch_loss += loss batch_loss += loss.item() print("batch_loss : {0}".format(batch_loss)) loss.backward() # Update number of steps optimizer.step() scheduler.step() data_time_avg.update(data_time) iter_time_avg.update(iter_timer.toc(False)) pred = get_prediction(data_loader.dataset, soutput.F, target) score = precision_at_one(pred, target) losses.update(batch_loss, target.size(0)) scores.update(score, target.size(0)) if curr_iter >= config.max_iter: is_training = False break if curr_iter % config.stat_freq == 0 or curr_iter == 1: lrs = ', '.join( ['{:.3e}'.format(x) for x in scheduler.get_lr()]) debug_str = "===> Epoch[{}]({}/{}): Loss {:.4f}\tLR: {}\t".format( epoch, curr_iter, len(data_loader) // config.iter_size, losses.avg, lrs) debug_str += "Score {:.3f}\tData time: {:.4f}, Total iter time: {:.4f}".format( scores.avg, data_time_avg.avg, iter_time_avg.avg) logging.info(debug_str) # Reset timers data_time_avg.reset() iter_time_avg.reset() # Write logs writer.add_scalar('training/loss', losses.avg, curr_iter) writer.add_scalar('training/precision_at_1', scores.avg, curr_iter) writer.add_scalar('training/learning_rate', scheduler.get_lr()[0], curr_iter) losses.reset() scores.reset() # Save current status, save before val to prevent occational mem overflow if curr_iter % config.save_freq == 0: checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou, best_val_iter) # Validation if curr_iter % config.val_freq == 0: val_miou, val_losses = validate(model, val_data_loader, writer, curr_iter, config, transform_data_fn, epoch) if val_miou > best_val_miou: best_val_miou = val_miou best_val_iter = curr_iter checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou, best_val_iter, "best_val") logging.info("Current best mIoU: {:.3f} at iter {}".format( best_val_miou, best_val_iter)) # Recover back model.train() if curr_iter % config.empty_cache_freq == 0: # Clear cache torch.cuda.empty_cache() batch_losses[epoch] = batch_loss # End of iteration curr_iter += 1 with open(config.log_dir + "/train_loss.txt", 'a') as train_loss_log: train_loss_log.writelines('{0}, {1}\n'.format( batch_losses[epoch], epoch)) train_loss_log.close() epoch += 1 # Explicit memory cleanup if hasattr(data_iter, 'cleanup'): data_iter.cleanup() # Save the final model checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou, best_val_iter) val_miou = validate(model, val_data_loader, writer, curr_iter, config, transform_data_fn, epoch)[0] if val_miou > best_val_miou: best_val_miou = val_miou best_val_iter = 
curr_iter
        checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou,
                   best_val_iter, "best_val")
    logging.info("Current best mIoU: {:.3f} at iter {}".format(
        best_val_miou, best_val_iter))
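# The inner loop of train() above mixes a standard cross-entropy term with a
# hand-rolled focal-style weighting (the alpha/gamma/eps constants defined at
# the top). Below is a minimal, self-contained sketch of that computation; the
# function name and signature are illustrative only -- the real loop operates
# on soutput.F and the batched target directly.
import torch
import torch.nn as nn

def focal_loss_sketch(logits, target, alpha=1.0, gamma=2.0, eps=1e-6, ignore_index=255):
    probs = nn.functional.softmax(logits, dim=1) + eps           # soft class probabilities
    focal_weight = torch.pow(1. - probs, gamma)                  # down-weight easy (confident) points
    focal = (-alpha * focal_weight * torch.log(probs)).mean()    # focal-style term, as in the loop
    ce = nn.functional.cross_entropy(logits, target.long(), ignore_index=ignore_index)
    return focal, ce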
def test(model, data_loader, config, transform_data_fn=None, has_gt=True, save_pred=False, split=None, submit_dir=None): device = get_torch_device(config.is_cuda) dataset = data_loader.dataset num_labels = dataset.NUM_LABELS global_timer, data_timer, iter_timer = Timer(), Timer(), Timer() criterion = nn.CrossEntropyLoss(ignore_index=config.ignore_label) losses, scores, ious = AverageMeter(), AverageMeter(), 0 aps = np.zeros((0, num_labels)) hist = np.zeros((num_labels, num_labels)) # some cfgs concerning the usage of instance-level information config.save_pred = save_pred if split is not None: assert save_pred if config.save_pred: save_dict = {} save_dict['pred'] = [] save_dict['coord'] = [] logging.info('===> Start testing') global_timer.tic() data_iter = data_loader.__iter__() max_iter = len(data_loader) max_iter_unique = max_iter # Fix batch normalization running mean and std model.eval() # Clear cache (when run in val mode, cleanup training cache) torch.cuda.empty_cache() # semantic kitti label inverse mapping if config.submit: remap_lut = Remap().getRemapLUT() with torch.no_grad(): # Calc of the iou total_correct = np.zeros(num_labels) total_seen = np.zeros(num_labels) total_positive = np.zeros(num_labels) point_nums = np.zeros([19]) for iteration in range(max_iter): data_timer.tic() if config.return_transformation: coords, input, target, unique_map_list, inverse_map_list, pointcloud, transformation, filename = data_iter.next() else: coords, input, target, unique_map_list, inverse_map_list, filename = data_iter.next() data_time = data_timer.toc(False) if config.use_aux: assert target.shape[1] == 2 aux = target[:,1] target = target[:,0] else: aux = None # Preprocess input iter_timer.tic() if config.normalize_color: input[:, :3] = input[:, :3] / input[:,:3].max() - 0.5 coords_norm = coords[:,1:] / coords[:,1:].max() - 0.5 XYZ_INPUT = config.xyz_input # cat xyz into the rgb feature if XYZ_INPUT: input = torch.cat([coords_norm, input], dim=1) sinput = ME.SparseTensor(input, coords, device=device) # Feed forward if aux is not None: soutput = model(sinput) else: soutput = model(sinput, iter_ = iteration / max_iter, enable_point_branch=config.enable_point_branch) output = soutput.F if torch.isnan(output).sum() > 0: import ipdb; ipdb.set_trace() pred = get_prediction(dataset, output, target).int() assert sum([int(t.shape[0]) for t in unique_map_list]) == len(pred), "number of points in unique_map doesn't match predition, do not enable preprocessing" iter_time = iter_timer.toc(False) if config.save_pred or config.submit: # troublesome processing for splitting each batch's data, and export batch_ids = sinput.C[:,0] splits_at = torch.stack([torch.where(batch_ids == i)[0][-1] for i in torch.unique(batch_ids)]).int() splits_at = splits_at + 1 splits_at_leftshift_one = splits_at.roll(shifts=1) splits_at_leftshift_one[0] = 0 # len_per_batch = splits_at - splits_at_leftshift_one len_sum = 0 batch_id = 0 for start, end in zip(splits_at_leftshift_one, splits_at): len_sum += len(pred[int(start):int(end)]) pred_this_batch = pred[int(start):int(end)] coord_this_batch = pred[int(start):int(end)] if config.save_pred: save_dict['pred'].append(pred_this_batch[inverse_map_list[batch_id]]) else: # save submit result submission_path = filename[batch_id].replace(config.semantic_kitti_path, submit_dir).replace('velodyne', 'predictions').replace('.bin', '.label') parent_dir = Path(submission_path).parent.absolute() if not os.path.exists(parent_dir): os.makedirs(parent_dir) label_pred = 
pred_this_batch[inverse_map_list[batch_id]].cpu().numpy() label_pred = remap_lut[label_pred].astype(np.uint32) label_pred.tofile(submission_path) print(submission_path) batch_id += 1 assert len_sum == len(pred) # Unpack it to original length REVERT_WHOLE_POINTCLOUD = True # print('{}/{}'.format(iteration, max_iter)) if REVERT_WHOLE_POINTCLOUD: whole_pred = [] whole_target = [] for batch_ in range(config.batch_size): batch_mask_ = (soutput.C[:,0] == batch_).cpu().numpy() if batch_mask_.sum() == 0: # for empty batch, skip em continue try: whole_pred_ = soutput.F[batch_mask_][inverse_map_list[batch_]] except: import ipdb; ipdb.set_trace() whole_target_ = target[batch_mask_][inverse_map_list[batch_]] whole_pred.append(whole_pred_) whole_target.append(whole_target_) whole_pred = torch.cat(whole_pred, dim=0) whole_target = torch.cat(whole_target, dim=0) pred = get_prediction(dataset, whole_pred, whole_target).int() output = whole_pred target = whole_target if has_gt: target_np = target.numpy() num_sample = target_np.shape[0] target = target.to(device) output = output.to(device) cross_ent = criterion(output, target.long()) losses.update(float(cross_ent), num_sample) scores.update(precision_at_one(pred, target), num_sample) hist += fast_hist(pred.cpu().numpy().flatten(), target_np.flatten(), num_labels) # within fast hist, mark label should >=0 & < num_label to filter out 255 / -1 ious = per_class_iu(hist) * 100 prob = torch.nn.functional.softmax(output, dim=-1) pred = pred[target != -1] target = target[target != -1] # for _ in range(num_labels): # debug for SemKITTI: spvnas way of calc miou # total_seen[_] += torch.sum(target == _) # total_correct[_] += torch.sum((pred == target) & (target == _)) # total_positive[_] += torch.sum(pred == _) # ious_ = [] # for _ in range(num_labels): # if total_seen[_] == 0: # ious_.append(1) # else: # ious_.append(total_correct[_]/(total_seen[_] + total_positive[_] - total_correct[_])) # ious_ = torch.stack(ious_, dim=-1).cpu().numpy()*100 # print(np.nanmean(per_class_iu(hist)), np.nanmean(ious_)) # ious = np.array(ious_)*100 # calc the ratio of total points # for i_ in range(19): # point_nums[i_] += (target == i_).sum().detach() # skip calculating aps ap = average_precision(prob.cpu().detach().numpy(), target_np) aps = np.vstack((aps, ap)) # Due to heavy bias in class, there exists class with no test label at all with warnings.catch_warnings(): warnings.simplefilter("ignore", category=RuntimeWarning) ap_class = np.nanmean(aps, 0) * 100. 
if iteration % config.test_stat_freq == 0 and iteration > 0 and not config.submit: reordered_ious = dataset.reorder_result(ious) reordered_ap_class = dataset.reorder_result(ap_class) # dirty fix for semnaticcKITTI has no getclassnames if hasattr(dataset, "class_names"): class_names = dataset.get_classnames() else: # semnantic KITTI class_names = None print_info( iteration, max_iter_unique, data_time, iter_time, has_gt, losses, scores, reordered_ious, hist, reordered_ap_class, class_names=class_names) if iteration % 5 == 0: # Clear cache torch.cuda.empty_cache() if config.save_pred: # torch.save(save_dict, os.path.join(config.log_dir, 'preds_{}_with_coord.pth'.format(split))) torch.save(save_dict, os.path.join(config.log_dir, 'preds_{}.pth'.format(split))) print("===> saved prediction result") global_time = global_timer.toc(False) save_map(model, config) reordered_ious = dataset.reorder_result(ious) reordered_ap_class = dataset.reorder_result(ap_class) if hasattr(dataset, "class_names"): class_names = dataset.get_classnames() else: class_names = None print_info( iteration, max_iter_unique, data_time, iter_time, has_gt, losses, scores, reordered_ious, hist, reordered_ap_class, class_names=class_names) logging.info("Finished test. Elapsed time: {:.4f}".format(global_time)) # Explicit memory cleanup if hasattr(data_iter, 'cleanup'): data_iter.cleanup() return losses.avg, scores.avg, np.nanmean(ap_class), np.nanmean(per_class_iu(hist)) * 100
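# The test loop above relies on fast_hist() / per_class_iu() to turn per-point
# predictions into a confusion matrix and per-class IoU. A sketch of the assumed
# semantics, matching how hist is used above (labels outside [0, num_labels),
# e.g. 255 or -1 ignore values, are filtered out before accumulation):
import numpy as np

def fast_hist_sketch(pred, label, num_labels):
    valid = (label >= 0) & (label < num_labels)
    return np.bincount(num_labels * label[valid].astype(int) + pred[valid],
                       minlength=num_labels ** 2).reshape(num_labels, num_labels)

def per_class_iu_sketch(hist):
    # IoU per class = TP / (TP + FP + FN), read off the rows/columns of the histogram
    return np.diag(hist) / (hist.sum(1) + hist.sum(0) - np.diag(hist) + 1e-12)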
output_directory = os.path.join(opts.output_path + "/outputs", model_name)
checkpoint_directory, image_directory = prepare_sub_folder(output_directory)
shutil.copy(opts.config, os.path.join(output_directory, 'config.yaml'))  # copy config file to output folder

# Start training
iterations = trainer.resume(checkpoint_directory, hyperparameters=config) if opts.resume else 0
while True:
    for it, (images_a, images_b) in enumerate(zip(train_loader_a, train_loader_b)):
        trainer.update_learning_rate()
        images_a, images_b = images_a.cuda().detach(), images_b.cuda().detach()

        with Timer("Elapsed time in update: %f"):
            # Main training code
            trainer.dis_update(images_a, images_b, config)
            trainer.gen_update(images_a, images_b, config)
            torch.cuda.synchronize()

        # Dump training stats in log file
        if (iterations + 1) % config['log_iter'] == 0:
            print("Iteration: %08d/%08d" % (iterations + 1, max_iter))
            write_loss(iterations, trainer, train_writer)

        # Write images
        if (iterations + 1) % config['image_save_iter'] == 0:
            with torch.no_grad():
                test_image_outputs = trainer.sample(test_display_images_a, test_display_images_b)
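# The snippet above assumes prepare_sub_folder() creates and returns the
# checkpoint/image sub-directories. This is an assumption about that helper,
# not its actual implementation; a minimal version consistent with its usage:
import os

def prepare_sub_folder_sketch(output_directory):
    checkpoint_directory = os.path.join(output_directory, 'checkpoints')
    image_directory = os.path.join(output_directory, 'images')
    os.makedirs(checkpoint_directory, exist_ok=True)
    os.makedirs(image_directory, exist_ok=True)
    return checkpoint_directory, image_directory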
def train_worker(gpu, num_devices, NetClass, data_loader, val_data_loader, config, transform_data_fn=None): if gpu is not None: print("Use GPU: {} for training".format(gpu)) rank = gpu addr = 23491 dist.init_process_group(backend="nccl", init_method="tcp://127.0.0.1:{}".format(addr), world_size=num_devices, rank=rank) # replace with DistributedSampler if config.multiprocess: from lib.dataloader_dist import InfSampler sampler = InfSampler(data_loader.dataset) data_loader = DataLoader(dataset=data_loader.dataset, num_workers=data_loader.num_workers, batch_size=data_loader.batch_size, collate_fn=data_loader.collate_fn, worker_init_fn=data_loader.worker_init_fn, sampler=sampler) if data_loader.dataset.NUM_IN_CHANNEL is not None: num_in_channel = data_loader.dataset.NUM_IN_CHANNEL else: num_in_channel = 3 num_labels = data_loader.dataset.NUM_LABELS # load model if config.pure_point: model = NetClass(num_class=config.num_labels, N=config.num_points, normal_channel=config.num_in_channel) else: if config.model == 'MixedTransformer': model = NetClass(config, num_class=num_labels, N=config.num_points, normal_channel=num_in_channel) elif config.model == 'MinkowskiVoxelTransformer': model = NetClass(config, num_in_channel, num_labels) elif config.model == 'MinkowskiTransformerNet': model = NetClass(config, num_in_channel, num_labels) elif "Res" in config.model: model = NetClass(num_in_channel, num_labels, config) else: model = NetClass(num_in_channel, num_labels, config) if config.weights == 'modelzoo': model.preload_modelzoo() elif config.weights.lower() != 'none': state = torch.load(config.weights) # delete the keys containing the attn since it raises size mismatch d = {k: v for k, v in state['state' '_dict'].items() if 'map' not in k} if config.weights_for_inner_model: model.model.load_state_dict(d) else: if config.lenient_weight_loading: matched_weights = load_state_with_same_shape( model, state['state_dict']) model_dict = model.state_dict() model_dict.update(matched_weights) model.load_state_dict(model_dict) else: model.load_state_dict(d, strict=False) torch.cuda.set_device(gpu) model.cuda(gpu) # use model with DDP model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[gpu], find_unused_parameters=False) # Synchronized batch norm model = ME.MinkowskiSyncBatchNorm.convert_sync_batchnorm(model) # Set up the train flag for batch normalization model.train() # Configuration data_timer, iter_timer = Timer(), Timer() data_time_avg, iter_time_avg = AverageMeter(), AverageMeter() regs, losses, scores = AverageMeter(), AverageMeter(), AverageMeter() optimizer = initialize_optimizer(model.parameters(), config) scheduler = initialize_scheduler(optimizer, config) criterion = nn.CrossEntropyLoss(ignore_index=config.ignore_label) # Train the network if rank == 0: setup_logger(config) logging.info('===> Start training') best_val_miou, best_val_iter, curr_iter, epoch, is_training = 0, 0, 1, 1, True if config.resume: # Test loaded ckpt first v_loss, v_score, v_mAP, v_mIoU = test(model, val_data_loader, config) checkpoint_fn = config.resume + '/weights.pth' if osp.isfile(checkpoint_fn): logging.info("=> loading checkpoint '{}'".format(checkpoint_fn)) state = torch.load(checkpoint_fn) curr_iter = state['iteration'] + 1 epoch = state['epoch'] # we skip attention maps because the shape won't match because voxel number is different # e.g. 
copyting a param with shape (23385, 8, 4) to (43529, 8, 4) d = { k: v for k, v in state['state_dict'].items() if 'map' not in k } # handle those attn maps we don't load from saved dict for k in model.state_dict().keys(): if k in d.keys(): continue d[k] = model.state_dict()[k] model.load_state_dict(d) if config.resume_optimizer: scheduler = initialize_scheduler(optimizer, config, last_step=curr_iter) optimizer.load_state_dict(state['optimizer']) if 'best_val' in state: best_val_miou = state['best_val'] best_val_iter = state['best_val_iter'] logging.info("=> loaded checkpoint '{}' (epoch {})".format( checkpoint_fn, state['epoch'])) else: raise ValueError( "=> no checkpoint found at '{}'".format(checkpoint_fn)) data_iter = data_loader.__iter__() device = gpu # multitrain fed in the device if config.dataset == "SemanticKITTI": num_class = 19 config.normalize_color = False config.xyz_input = False val_freq_ = config.val_freq config.val_freq = config.val_freq * 10 # origianl val_freq_ elif config.dataset == 'S3DIS': num_class = 13 config.normalize_color = False config.xyz_input = False val_freq_ = config.val_freq elif config.dataset == "Nuscenes": num_class = 16 config.normalize_color = False config.xyz_input = False val_freq_ = config.val_freq config.val_freq = config.val_freq * 50 else: val_freq_ = config.val_freq num_class = 20 while is_training: total_correct_class = torch.zeros(num_class, device=device) total_iou_deno_class = torch.zeros(num_class, device=device) for iteration in range(len(data_loader) // config.iter_size): optimizer.zero_grad() data_time, batch_loss = 0, 0 iter_timer.tic() if curr_iter >= config.max_iter: # if curr_iter >= max(config.max_iter, config.epochs*(len(data_loader) // config.iter_size): is_training = False break elif curr_iter >= config.max_iter * (2 / 3): config.val_freq = val_freq_ * 2 # valid more freq on lower half for sub_iter in range(config.iter_size): # Get training data data_timer.tic() if config.return_transformation: coords, input, target, _, _, pointcloud, transformation = data_iter.next( ) else: coords, input, target, _, _ = data_iter.next( ) # ignore unique_map and inverse_map if config.use_aux: assert target.shape[1] == 2 aux = target[:, 1] target = target[:, 0] else: aux = None # For some networks, making the network invariant to even, odd coords is important coords[:, 1:] += (torch.rand(3) * 100).type_as(coords) # Preprocess input if config.normalize_color: input[:, :3] = input[:, :3] / input[:, :3].max() - 0.5 coords_norm = coords[:, 1:] / coords[:, 1:].max() - 0.5 # cat xyz into the rgb feature if config.xyz_input: input = torch.cat([coords_norm, input], dim=1) # print(device) sinput = SparseTensor(input, coords, device=device) # d = {} # d['coord'] = sinput.C # d['feat'] = sinput.F # torch.save(d, 'voxel.pth') # import ipdb; ipdb.set_trace() data_time += data_timer.toc(False) # model.initialize_coords(*init_args) if aux is not None: soutput = model(sinput, aux) elif config.enable_point_branch: soutput = model(sinput, iter_=curr_iter / config.max_iter, enable_point_branch=True) else: soutput = model( sinput, iter_=curr_iter / config.max_iter ) # feed in the progress of training for annealing inside the model # soutput = model(sinput) # The output of the network is not sorted target = target.view(-1).long().to(device) loss = criterion(soutput.F, target.long()) # ====== other loss regs ===== cur_loss = torch.tensor([0.], device=device) if hasattr(model, 'module.block1'): cur_loss = torch.tensor([0.], device=device) if 
hasattr(model.module.block1[0], 'vq_loss'): if model.block1[0].vq_loss is not None: cur_loss = torch.tensor([0.], device=device) for n, m in model.named_children(): if 'block' in n: cur_loss += m[ 0].vq_loss # m is the nn.Sequential obj, m[0] is the TRBlock logging.info( 'Cur Loss: {}, Cur vq_loss: {}'.format( loss, cur_loss)) loss += cur_loss if hasattr(model.module.block1[0], 'diverse_loss'): if model.block1[0].diverse_loss is not None: cur_loss = torch.tensor([0.], device=device) for n, m in model.named_children(): if 'block' in n: cur_loss += m[ 0].diverse_loss # m is the nn.Sequential obj, m[0] is the TRBlock logging.info( 'Cur Loss: {}, Cur diverse _loss: {}'.format( loss, cur_loss)) loss += cur_loss if hasattr(model.module.block1[0], 'label_reg'): if model.block1[0].label_reg is not None: cur_loss = torch.tensor([0.], device=device) for n, m in model.named_children(): if 'block' in n: cur_loss += m[ 0].label_reg # m is the nn.Sequential obj, m[0] is the TRBlock # logging.info('Cur Loss: {}, Cur diverse _loss: {}'.format(loss, cur_loss)) loss += cur_loss # Compute and accumulate gradient loss /= config.iter_size batch_loss += loss.item() if not config.use_sam: loss.backward() else: with model.no_sync(): loss.backward() # Update number of steps if not config.use_sam: optimizer.step() else: optimizer.first_step(zero_grad=True) soutput = model(sinput, iter_=curr_iter / config.max_iter, aux=starget) criterion(soutput.F, target.long()).backward() optimizer.second_step(zero_grad=True) if config.lr_warmup is None: scheduler.step() else: if curr_iter >= config.lr_warmup: scheduler.step() else: for g in optimizer.param_groups: g['lr'] = config.lr * (iteration + 1) / config.lr_warmup # CLEAR CACHE! torch.cuda.empty_cache() data_time_avg.update(data_time) iter_time_avg.update(iter_timer.toc(False)) pred = get_prediction(data_loader.dataset, soutput.F, target) score = precision_at_one(pred, target, ignore_label=-1) regs.update(cur_loss.item(), target.size(0)) losses.update(batch_loss, target.size(0)) scores.update(score, target.size(0)) # calc the train-iou for l in range(num_class): total_correct_class[l] += ((pred == l) & (target == l)).sum() total_iou_deno_class[l] += (((pred == l) & (target != -1)) | (target == l)).sum() if curr_iter % config.stat_freq == 0 or curr_iter == 1: lrs = ', '.join( ['{:.3e}'.format(g['lr']) for g in optimizer.param_groups]) IoU = ((total_correct_class) / (total_iou_deno_class + 1e-6)).mean() * 100. 
debug_str = "===> Epoch[{}]({}/{}): Loss {:.4f}\tLR: {}\t".format( epoch, curr_iter, len(data_loader) // config.iter_size, losses.avg, lrs) debug_str += "Score {:.3f}\tIoU {:.3f}\tData time: {:.4f}, Iter time: {:.4f}".format( scores.avg, IoU.item(), data_time_avg.avg, iter_time_avg.avg) if regs.avg > 0: debug_str += "\n Additional Reg Loss {:.3f}".format( regs.avg) if rank == 0: logging.info(debug_str) # Reset timers data_time_avg.reset() iter_time_avg.reset() # Write logs losses.reset() scores.reset() # only save status on the 1st gpu if rank == 0: # Save current status, save before val to prevent occational mem overflow if curr_iter % config.save_freq == 0: checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou, best_val_iter, save_inter=True) # Validation if curr_iter % config.val_freq == 0: val_miou = validate(model, val_data_loader, None, curr_iter, config, transform_data_fn ) # feedin None for SummaryWriter args if val_miou > best_val_miou: best_val_miou = val_miou best_val_iter = curr_iter checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou, best_val_iter, "best_val", save_inter=True) if rank == 0: logging.info( "Current best mIoU: {:.3f} at iter {}".format( best_val_miou, best_val_iter)) # Recover back model.train() # End of iteration curr_iter += 1 IoU = (total_correct_class) / (total_iou_deno_class + 1e-6) if rank == 0: logging.info('train point avg class IoU: %f' % ((IoU).mean() * 100.)) epoch += 1 # Explicit memory cleanup if hasattr(data_iter, 'cleanup'): data_iter.cleanup() # Save the final model if rank == 0: checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou, best_val_iter) v_loss, v_score, v_mAP, val_mIoU = test(model, val_data_loader, config) if val_miou > best_val_miou and rank == 0: best_val_miou = val_miou best_val_iter = curr_iter logging.info("Final best miou: {} at iter {} ".format( val_miou, curr_iter)) checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou, best_val_iter, "best_val") logging.info("Current best mIoU: {:.3f} at iter {}".format( best_val_miou, best_val_iter))
def findsuitjob(stoper, date):
    processmsg = 'Storedailyinit start findsuit:' + date
    with Timer() as t:
        stoper.findsuit(date)
    processmsg = 'analyst done for %s in %d secs' % (date, t.secs)
    return processmsg
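# findsuitjob() uses Timer as a context manager and reads t.secs afterwards,
# while an earlier snippet passes a printf-style message to Timer(). A rough
# sketch of a timer satisfying both usages; the real Timer classes in these
# snippets may differ (several only expose tic()/toc()).
import time

class TimerSketch:
    def __init__(self, msg=None):
        self.msg = msg

    def __enter__(self):
        self.start = time.time()
        return self

    def __exit__(self, *exc):
        self.secs = time.time() - self.start
        if self.msg:
            print(self.msg % self.secs)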
def train_point(model, data_loader, val_data_loader, config, transform_data_fn=None): device = get_torch_device(config.is_cuda) # Set up the train flag for batch normalization model.train() # Configuration data_timer, iter_timer = Timer(), Timer() data_time_avg, iter_time_avg = AverageMeter(), AverageMeter() losses, scores = AverageMeter(), AverageMeter() optimizer = initialize_optimizer(model.parameters(), config) scheduler = initialize_scheduler(optimizer, config) criterion = nn.CrossEntropyLoss(ignore_index=-1) # Train the network logging.info('===> Start training') best_val_miou, best_val_iter, curr_iter, epoch, is_training = 0, 0, 1, 1, True if config.resume: checkpoint_fn = config.resume + '/weights.pth' if osp.isfile(checkpoint_fn): logging.info("=> loading checkpoint '{}'".format(checkpoint_fn)) state = torch.load(checkpoint_fn) curr_iter = state['iteration'] + 1 epoch = state['epoch'] d = { k: v for k, v in state['state_dict'].items() if 'map' not in k } model.load_state_dict(d) if config.resume_optimizer: scheduler = initialize_scheduler(optimizer, config, last_step=curr_iter) optimizer.load_state_dict(state['optimizer']) if 'best_val' in state: best_val_miou = state['best_val'] best_val_iter = state['best_val_iter'] logging.info("=> loaded checkpoint '{}' (epoch {})".format( checkpoint_fn, state['epoch'])) else: raise ValueError( "=> no checkpoint found at '{}'".format(checkpoint_fn)) data_iter = data_loader.__iter__() while is_training: num_class = 20 total_correct_class = torch.zeros(num_class, device=device) total_iou_deno_class = torch.zeros(num_class, device=device) for iteration in range(len(data_loader) // config.iter_size): optimizer.zero_grad() data_time, batch_loss = 0, 0 iter_timer.tic() for sub_iter in range(config.iter_size): # Get training data data = data_iter.next() points, target, sample_weight = data if config.pure_point: sinput = points.transpose(1, 2).cuda().float() # DEBUG: use the discrete coord for point-based ''' feats = torch.unbind(points[:,:,:], dim=0) voxel_size = config.voxel_size coords = torch.unbind(points[:,:,:3]/voxel_size, dim=0) # 0.05 is the voxel-size coords, feats= ME.utils.sparse_collate(coords, feats) # assert feats.reshape([16, 4096, -1]) == points[:,:,3:] points_ = ME.TensorField(features=feats.float(), coordinates=coords, device=device) tmp_voxel = points_.sparse() sinput_ = tmp_voxel.slice(points_) sinput = torch.cat([sinput_.C[:,1:]*config.voxel_size, sinput_.F[:,3:]],dim=1).reshape([config.batch_size, config.num_points, 6]) # sinput = sinput_.F.reshape([config.batch_size, config.num_points, 6]) sinput = sinput.transpose(1,2).cuda().float() # sinput = torch.cat([coords[:,1:], feats],dim=1).reshape([config.batch_size, config.num_points, 6]) # sinput = sinput.transpose(1,2).cuda().float() ''' # For some networks, making the network invariant to even, odd coords is important # coords[:, 1:] += (torch.rand(3) * 100).type_as(coords) # Preprocess input # if config.normalize_color: # feats = feats / 255. 
- 0.5 # torch.save(points[:,:,:3], './sandbox/tensorfield-c.pth') # torch.save(points_.C, './sandbox/points-c.pth') else: # feats = torch.unbind(points[:,:,3:], dim=0) # WRONG: should also feed in xyz as inupt feature voxel_size = config.voxel_size coords = torch.unbind(points[:, :, :3] / voxel_size, dim=0) # 0.05 is the voxel-size # Normalize the xyz in feature # points[:,:,:3] = points[:,:,:3] / points[:,:,:3].mean() feats = torch.unbind(points[:, :, :], dim=0) coords, feats = ME.utils.sparse_collate(coords, feats) # For some networks, making the network invariant to even, odd coords is important coords[:, 1:] += (torch.rand(3) * 100).type_as(coords) # Preprocess input # if config.normalize_color: # feats = feats / 255. - 0.5 # they are the same points_ = ME.TensorField(features=feats.float(), coordinates=coords, device=device) # points_1 = ME.TensorField(features=feats.float(), coordinates=coords, device=device, quantization_mode=ME.SparseTensorQuantizationMode.UNWEIGHTED_AVERAGE) # points_2 = ME.TensorField(features=feats.float(), coordinates=coords, device=device, quantization_mode=ME.SparseTensorQuantizationMode.RANDOM_SUBSAMPLE) sinput = points_.sparse() data_time += data_timer.toc(False) B, npoint = target.shape # model.initialize_coords(*init_args) soutput = model(sinput) if config.pure_point: soutput = soutput.reshape([B * npoint, -1]) else: soutput = soutput.slice(points_).F # s1 = soutput.slice(points_) # print(soutput.quantization_mode) # soutput.quantization_mode = ME.SparseTensorQuantizationMode.RANDOM_SUBSAMPLE # s2 = soutput.slice(points_) # The output of the network is not sorted target = (target - 1).view(-1).long().to(device) # catch NAN if torch.isnan(soutput).sum() > 0: import ipdb ipdb.set_trace() loss = criterion(soutput, target) if torch.isnan(loss).sum() > 0: import ipdb ipdb.set_trace() loss = (loss * sample_weight.to(device)).mean() # Compute and accumulate gradient loss /= config.iter_size batch_loss += loss.item() loss.backward() # print(model.input_mlp[0].weight.max()) # print(model.input_mlp[0].weight.grad.max()) # Update number of steps optimizer.step() scheduler.step() # CLEAR CACHE! 
torch.cuda.empty_cache() data_time_avg.update(data_time) iter_time_avg.update(iter_timer.toc(False)) pred = get_prediction(data_loader.dataset, soutput, target) score = precision_at_one(pred, target, ignore_label=-1) losses.update(batch_loss, target.size(0)) scores.update(score, target.size(0)) # Calc the iou for l in range(num_class): total_correct_class[l] += ((pred == l) & (target == l)).sum() total_iou_deno_class[l] += (((pred == l) & (target >= 0)) | (target == l)).sum() if curr_iter >= config.max_iter: is_training = False break if curr_iter % config.stat_freq == 0 or curr_iter == 1: lrs = ', '.join( ['{:.3e}'.format(x) for x in scheduler.get_lr()]) debug_str = "===> Epoch[{}]({}/{}): Loss {:.4f}\tLR: {}\t".format( epoch, curr_iter, len(data_loader) // config.iter_size, losses.avg, lrs) debug_str += "Score {:.3f}\tData time: {:.4f}, Iter time: {:.4f}".format( scores.avg, data_time_avg.avg, iter_time_avg.avg) logging.info(debug_str) # Reset timers data_time_avg.reset() iter_time_avg.reset() # Write logs losses.reset() scores.reset() # Save current status, save before val to prevent occational mem overflow if curr_iter % config.save_freq == 0: checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou, best_val_iter, save_inter=True) # Validation: # for point-based should use alternate dataloader for eval # if curr_iter % config.val_freq == 0: # val_miou = test_points(model, val_data_loader, None, curr_iter, config, transform_data_fn) # if val_miou > best_val_miou: # best_val_miou = val_miou # best_val_iter = curr_iter # checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou, best_val_iter, # "best_val") # logging.info("Current best mIoU: {:.3f} at iter {}".format(best_val_miou, best_val_iter)) # # Recover back # model.train() # End of iteration curr_iter += 1 IoU = (total_correct_class) / (total_iou_deno_class + 1e-6) logging.info('train point avg class IoU: %f' % ((IoU).mean() * 100.)) epoch += 1 # Explicit memory cleanup if hasattr(data_iter, 'cleanup'): data_iter.cleanup() # Save the final model checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou, best_val_iter) test_points(model, val_data_loader, config) if val_miou > best_val_miou: best_val_miou = val_miou best_val_iter = curr_iter checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou, best_val_iter, "best_val") logging.info("Current best mIoU: {:.3f} at iter {}".format( best_val_miou, best_val_iter))
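# Sketch of the per-class train-IoU bookkeeping used in train_point() above:
# the numerator accumulates true positives per class, the denominator the
# union of predictions and labels for that class (points with target < 0 are
# ignored). The helper name and return value are illustrative.
import torch

def update_train_iou_sketch(pred, target, total_correct_class, total_iou_deno_class, num_class=20):
    for l in range(num_class):
        total_correct_class[l] += ((pred == l) & (target == l)).sum()
        total_iou_deno_class[l] += (((pred == l) & (target >= 0)) | (target == l)).sum()
    return (total_correct_class / (total_iou_deno_class + 1e-6)).mean() * 100.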
def train(model, data_loader, val_data_loader, config, transform_data_fn=None): device = get_torch_device(config.is_cuda) # Set up the train flag for batch normalization model.train() # Configuration data_timer, iter_timer = Timer(), Timer() data_time_avg, iter_time_avg = AverageMeter(), AverageMeter() regs, losses, scores = AverageMeter(), AverageMeter(), AverageMeter() optimizer = initialize_optimizer(model.parameters(), config) scheduler = initialize_scheduler(optimizer, config) criterion = nn.CrossEntropyLoss(ignore_index=config.ignore_label) # Train the network logging.info('===> Start training') best_val_miou, best_val_iter, curr_iter, epoch, is_training = 0, 0, 1, 1, True if config.resume: # Test loaded ckpt first v_loss, v_score, v_mAP, v_mIoU = test(model, val_data_loader, config) checkpoint_fn = config.resume + '/weights.pth' if osp.isfile(checkpoint_fn): logging.info("=> loading checkpoint '{}'".format(checkpoint_fn)) state = torch.load(checkpoint_fn) curr_iter = state['iteration'] + 1 epoch = state['epoch'] # we skip attention maps because the shape won't match because voxel number is different # e.g. copyting a param with shape (23385, 8, 4) to (43529, 8, 4) d = { k: v for k, v in state['state_dict'].items() if 'map' not in k } # handle those attn maps we don't load from saved dict for k in model.state_dict().keys(): if k in d.keys(): continue d[k] = model.state_dict()[k] model.load_state_dict(d) if config.resume_optimizer: scheduler = initialize_scheduler(optimizer, config, last_step=curr_iter) optimizer.load_state_dict(state['optimizer']) if 'best_val' in state: best_val_miou = state['best_val'] best_val_iter = state['best_val_iter'] logging.info("=> loaded checkpoint '{}' (epoch {})".format( checkpoint_fn, state['epoch'])) else: raise ValueError( "=> no checkpoint found at '{}'".format(checkpoint_fn)) data_iter = data_loader.__iter__() if config.dataset == "SemanticKITTI": num_class = 19 config.normalize_color = False config.xyz_input = False val_freq_ = config.val_freq config.val_freq = config.val_freq * 10 elif config.dataset == "S3DIS": num_class = 13 config.normalize_color = False config.xyz_input = False val_freq_ = config.val_freq config.val_freq = config.val_freq elif config.dataset == "Nuscenes": num_class = 16 config.normalize_color = False config.xyz_input = False val_freq_ = config.val_freq config.val_freq = config.val_freq * 50 else: num_class = 20 val_freq_ = config.val_freq while is_training: total_correct_class = torch.zeros(num_class, device=device) total_iou_deno_class = torch.zeros(num_class, device=device) for iteration in range(len(data_loader) // config.iter_size): optimizer.zero_grad() data_time, batch_loss = 0, 0 iter_timer.tic() if curr_iter >= config.max_iter: # if curr_iter >= max(config.max_iter, config.epochs*(len(data_loader) // config.iter_size): is_training = False break elif curr_iter >= config.max_iter * (2 / 3): config.val_freq = val_freq_ * 2 # valid more freq on lower half for sub_iter in range(config.iter_size): # Get training data data_timer.tic() pointcloud = None if config.return_transformation: coords, input, target, _, _, pointcloud, transformation, _ = data_iter.next( ) else: coords, input, target, _, _, _ = data_iter.next( ) # ignore unique_map and inverse_map if config.use_aux: assert target.shape[1] == 2 aux = target[:, 1] target = target[:, 0] else: aux = None # For some networks, making the network invariant to even, odd coords is important coords[:, 1:] += (torch.rand(3) * 100).type_as(coords) # Preprocess input if 
config.normalize_color: input[:, :3] = input[:, :3] / input[:, :3].max() - 0.5 coords_norm = coords[:, 1:] / coords[:, 1:].max() - 0.5 # cat xyz into the rgb feature if config.xyz_input: input = torch.cat([coords_norm, input], dim=1) sinput = SparseTensor(input, coords, device=device) starget = SparseTensor( target.unsqueeze(-1).float(), coordinate_map_key=sinput.coordinate_map_key, coordinate_manager=sinput.coordinate_manager, device=device ) # must share the same coord-manager to align for sinput data_time += data_timer.toc(False) # model.initialize_coords(*init_args) # d = {} # d['c'] = sinput.C # d['l'] = starget.F # torch.save('./plot/test-label.pth') # import ipdb; ipdb.set_trace() # Set up profiler # memory_profiler = CUDAMemoryProfiler( # [model, criterion], # filename="cuda_memory.profile" # ) # sys.settrace(memory_profiler) # threading.settrace(memory_profiler) # with torch.autograd.profiler.profile(enabled=True, use_cuda=True, record_shapes=False, profile_memory=True) as prof0: if aux is not None: soutput = model(sinput, aux) elif config.enable_point_branch: soutput = model(sinput, iter_=curr_iter / config.max_iter, enable_point_branch=True) else: # label-aux, feed it in as additional reg soutput = model( sinput, iter_=curr_iter / config.max_iter, aux=starget ) # feed in the progress of training for annealing inside the model # The output of the network is not sorted target = target.view(-1).long().to(device) loss = criterion(soutput.F, target.long()) # ====== other loss regs ===== if hasattr(model, 'block1'): cur_loss = torch.tensor([0.], device=device) if hasattr(model.block1[0], 'vq_loss'): if model.block1[0].vq_loss is not None: cur_loss = torch.tensor([0.], device=device) for n, m in model.named_children(): if 'block' in n: cur_loss += m[ 0].vq_loss # m is the nn.Sequential obj, m[0] is the TRBlock logging.info( 'Cur Loss: {}, Cur vq_loss: {}'.format( loss, cur_loss)) loss += cur_loss if hasattr(model.block1[0], 'diverse_loss'): if model.block1[0].diverse_loss is not None: cur_loss = torch.tensor([0.], device=device) for n, m in model.named_children(): if 'block' in n: cur_loss += m[ 0].diverse_loss # m is the nn.Sequential obj, m[0] is the TRBlock logging.info( 'Cur Loss: {}, Cur diverse _loss: {}'.format( loss, cur_loss)) loss += cur_loss if hasattr(model.block1[0], 'label_reg'): if model.block1[0].label_reg is not None: cur_loss = torch.tensor([0.], device=device) for n, m in model.named_children(): if 'block' in n: cur_loss += m[ 0].label_reg # m is the nn.Sequential obj, m[0] is the TRBlock # logging.info('Cur Loss: {}, Cur diverse _loss: {}'.format(loss, cur_loss)) loss += cur_loss # Compute and accumulate gradient loss /= config.iter_size batch_loss += loss.item() loss.backward() # soutput = model(sinput) # Update number of steps if not config.use_sam: optimizer.step() else: optimizer.first_step(zero_grad=True) soutput = model(sinput, iter_=curr_iter / config.max_iter, aux=starget) criterion(soutput.F, target.long()).backward() optimizer.second_step(zero_grad=True) if config.lr_warmup is None: scheduler.step() else: if curr_iter >= config.lr_warmup: scheduler.step() for g in optimizer.param_groups: g['lr'] = config.lr * (iteration + 1) / config.lr_warmup # CLEAR CACHE! 
torch.cuda.empty_cache() data_time_avg.update(data_time) iter_time_avg.update(iter_timer.toc(False)) pred = get_prediction(data_loader.dataset, soutput.F, target) score = precision_at_one(pred, target, ignore_label=-1) regs.update(cur_loss.item(), target.size(0)) losses.update(batch_loss, target.size(0)) scores.update(score, target.size(0)) # calc the train-iou for l in range(num_class): total_correct_class[l] += ((pred == l) & (target == l)).sum() total_iou_deno_class[l] += (((pred == l) & (target != -1)) | (target == l)).sum() if curr_iter % config.stat_freq == 0 or curr_iter == 1: lrs = ', '.join( ['{:.3e}'.format(x) for x in scheduler.get_lr()]) IoU = ((total_correct_class) / (total_iou_deno_class + 1e-6)).mean() * 100. debug_str = "[{}] ===> Epoch[{}]({}/{}): Loss {:.4f}\tLR: {}\t".format( config.log_dir.split('/')[-2], epoch, curr_iter, len(data_loader) // config.iter_size, losses.avg, lrs) debug_str += "Score {:.3f}\tIoU {:.3f}\tData time: {:.4f}, Iter time: {:.4f}".format( scores.avg, IoU.item(), data_time_avg.avg, iter_time_avg.avg) if regs.avg > 0: debug_str += "\n Additional Reg Loss {:.3f}".format( regs.avg) # print(debug_str) logging.info(debug_str) # Reset timers data_time_avg.reset() iter_time_avg.reset() # Write logs losses.reset() scores.reset() # Save current status, save before val to prevent occational mem overflow if curr_iter % config.save_freq == 0: checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou, best_val_iter, save_inter=True) # Validation if curr_iter % config.val_freq == 0: val_miou = validate(model, val_data_loader, None, curr_iter, config, transform_data_fn) if val_miou > best_val_miou: best_val_miou = val_miou best_val_iter = curr_iter checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou, best_val_iter, "best_val", save_inter=True) logging.info("Current best mIoU: {:.3f} at iter {}".format( best_val_miou, best_val_iter)) # print("Current best mIoU: {:.3f} at iter {}".format(best_val_miou, best_val_iter)) # Recover back model.train() # End of iteration curr_iter += 1 IoU = (total_correct_class) / (total_iou_deno_class + 1e-6) logging.info('train point avg class IoU: %f' % ((IoU).mean() * 100.)) epoch += 1 # Explicit memory cleanup if hasattr(data_iter, 'cleanup'): data_iter.cleanup() # Save the final model checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou, best_val_iter) v_loss, v_score, v_mAP, val_miou = test(model, val_data_loader, config) if val_miou > best_val_miou: best_val_miou = val_miou best_val_iter = curr_iter checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou, best_val_iter, "best_val") logging.info("Current best mIoU: {:.3f} at iter {}".format( best_val_miou, best_val_iter))
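# When config.use_sam is set, the loop above performs a two-pass update through
# optimizer.first_step()/second_step(). Those methods are not part of
# torch.optim; they assume a SAM-style optimizer wrapper as in the common
# reference implementation. A sketch of that pattern (argument names are
# illustrative, the real loop uses sinput/starget built above):
def sam_update_sketch(model, optimizer, criterion, sinput, target, progress, starget=None):
    loss = criterion(model(sinput, iter_=progress, aux=starget).F, target.long())
    loss.backward()
    optimizer.first_step(zero_grad=True)    # perturb weights toward the locally sharpest direction
    criterion(model(sinput, iter_=progress, aux=starget).F, target.long()).backward()
    optimizer.second_step(zero_grad=True)   # real update using gradients at the perturbed weights
    return loss.item()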
def train_distill(model, data_loader, val_data_loader, config, transform_data_fn=None): ''' the distillation training some cfgs here ''' # distill_lambda = 1 # distill_lambda = 0.33 distill_lambda = 0.67 # TWO_STAGE=True: Transformer is first trained with L2 loss to match ResNet's activation, and then it fintunes like normal training on the second stage. # TWO_STAGE=False: Transformer trains with combined loss TWO_STAGE = False # STAGE_PERCENTAGE = 0.7 device = get_torch_device(config.is_cuda) # Set up the train flag for batch normalization model.train() # Configuration data_timer, iter_timer = Timer(), Timer() data_time_avg, iter_time_avg = AverageMeter(), AverageMeter() losses, scores = AverageMeter(), AverageMeter() optimizer = initialize_optimizer(model.parameters(), config) scheduler = initialize_scheduler(optimizer, config) criterion = nn.CrossEntropyLoss(ignore_index=config.ignore_label) # Train the network logging.info('===> Start training') best_val_miou, best_val_iter, curr_iter, epoch, is_training = 0, 0, 1, 1, True # TODO: # load the sub-model only # FIXME: some dirty hard-written stuff, only supporting current state tch_model_cls = load_model('Res16UNet18A') tch_model = tch_model_cls(3, 20, config).to(device) # checkpoint_fn = "/home/zhaotianchen/project/point-transformer/SpatioTemporalSegmentation-ScanNet/outputs/ScannetSparseVoxelizationDataset/Res16UNet18A/resnet_base/weights.pth" checkpoint_fn = "/home/zhaotianchen/project/point-transformer/SpatioTemporalSegmentation-ScanNet/outputs/ScannetSparseVoxelizationDataset/Res16UNet18A/Res18A/weights.pth" # voxel-size: 0.05 assert osp.isfile(checkpoint_fn) logging.info("=> loading checkpoint '{}'".format(checkpoint_fn)) state = torch.load(checkpoint_fn) d = {k: v for k, v in state['state_dict'].items() if 'map' not in k} tch_model.load_state_dict(d) if 'best_val' in state: best_val_miou = state['best_val'] best_val_iter = state['best_val_iter'] logging.info("=> loaded checkpoint '{}' (epoch {})".format( checkpoint_fn, state['epoch'])) if config.resume: raise NotImplementedError # Test loaded ckpt first # checkpoint_fn = config.resume + '/weights.pth' # if osp.isfile(checkpoint_fn): # logging.info("=> loading checkpoint '{}'".format(checkpoint_fn)) # state = torch.load(checkpoint_fn) # curr_iter = state['iteration'] + 1 # epoch = state['epoch'] # d = {k:v for k,v in state['state_dict'].items() if 'map' not in k } # model.load_state_dict(d) # if config.resume_optimizer: # scheduler = initialize_scheduler(optimizer, config, last_step=curr_iter) # optimizer.load_state_dict(state['optimizer']) # if 'best_val' in state: # best_val_miou = state['best_val'] # best_val_iter = state['best_val_iter'] # logging.info("=> loaded checkpoint '{}' (epoch {})".format(checkpoint_fn, state['epoch'])) # else: # raise ValueError("=> no checkpoint found at '{}'".format(checkpoint_fn)) # test after loading the ckpt v_loss, v_score, v_mAP, v_mIoU = test(tch_model, val_data_loader, config) logging.info('Tch model tested, bes_miou: {}'.format(v_mIoU)) data_iter = data_loader.__iter__() while is_training: num_class = 20 total_correct_class = torch.zeros(num_class, device=device) total_iou_deno_class = torch.zeros(num_class, device=device) total_iteration = len(data_loader) // config.iter_size for iteration in range(total_iteration): # NOTE: for single stage distillation, L2 loss might be too large at first # so we added a warmup training that don't use L2 loss if iteration < 0: use_distill = False else: use_distill = True # Stage 1 / Stage 2 boundary if 
TWO_STAGE: stage_boundary = int(total_iteration * STAGE_PERCENTAGE) optimizer.zero_grad() data_time, batch_loss = 0, 0 iter_timer.tic() for sub_iter in range(config.iter_size): # Get training data data_timer.tic() if config.return_transformation: coords, input, target, _, _, pointcloud, transformation = data_iter.next( ) else: coords, input, target, _, _ = data_iter.next( ) # ignore unique_map and inverse_map if config.use_aux: assert target.shape[1] == 2 aux = target[:, 1] target = target[:, 0] else: aux = None # For some networks, making the network invariant to even, odd coords is important coords[:, 1:] += (torch.rand(3) * 100).type_as(coords) # Preprocess input if config.normalize_color: input[:, :3] = input[:, :3] / 255. - 0.5 coords_norm = coords[:, 1:] / coords[:, 1:].max() - 0.5 # cat xyz into the rgb feature if config.xyz_input: input = torch.cat([coords_norm, input], dim=1) sinput = SparseTensor(input, coords, device=device) # TODO: return both-models # in order to not breaking the valid interface, use a get_loss to get the regsitered loss data_time += data_timer.toc(False) # model.initialize_coords(*init_args) if aux is not None: raise NotImplementedError # flatten ground truth tensor target = target.view(-1).long().to(device) if TWO_STAGE: if iteration < stage_boundary: # Stage 1: train transformer on L2 loss soutput, anchor = model(sinput, save_anchor=True) # Make sure gradient don't flow to teacher model with torch.no_grad(): _, tch_anchor = tch_model(sinput, save_anchor=True) loss = DistillLoss(tch_anchor, anchor) else: # Stage 2: finetune transformer on Cross-Entropy soutput = model(sinput) loss = criterion(soutput.F, target.long()) else: if use_distill: # after warm up soutput, anchor = model(sinput, save_anchor=True) # if pretrained teacher, do not let the grad flow to teacher to update its params with torch.no_grad(): tch_soutput, tch_anchor = tch_model( sinput, save_anchor=True) else: # warming up soutput = model(sinput) # The output of the network is not sorted loss = criterion(soutput.F, target.long()) # Add L2 loss if use distillation if use_distill: distill_loss = DistillLoss(tch_anchor, anchor) * distill_lambda loss += distill_loss # Compute and accumulate gradient loss /= config.iter_size batch_loss += loss.item() loss.backward() # Update number of steps optimizer.step() scheduler.step() # CLEAR CACHE! 
torch.cuda.empty_cache() data_time_avg.update(data_time) iter_time_avg.update(iter_timer.toc(False)) pred = get_prediction(data_loader.dataset, soutput.F, target) score = precision_at_one(pred, target, ignore_label=-1) losses.update(batch_loss, target.size(0)) scores.update(score, target.size(0)) # calc the train-iou for l in range(num_class): total_correct_class[l] += ((pred == l) & (target == l)).sum() total_iou_deno_class[l] += (((pred == l) & (target != -1)) | (target == l)).sum() if curr_iter >= config.max_iter: is_training = False break if curr_iter % config.stat_freq == 0 or curr_iter == 1: lrs = ', '.join( ['{:.3e}'.format(x) for x in scheduler.get_lr()]) debug_str = "[{}] ===> Epoch[{}]({}/{}): Loss {:.4f}\tLR: {}\t".format( config.log_dir, epoch, curr_iter, len(data_loader) // config.iter_size, losses.avg, lrs) debug_str += "Score {:.3f}\tData time: {:.4f}, Iter time: {:.4f}".format( scores.avg, data_time_avg.avg, iter_time_avg.avg) logging.info(debug_str) if use_distill and not TWO_STAGE: logging.info('Loss {} Distill Loss:{}'.format( loss, distill_loss)) # Reset timers data_time_avg.reset() iter_time_avg.reset() losses.reset() scores.reset() # Save current status, save before val to prevent occational mem overflow if curr_iter % config.save_freq == 0: checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou, best_val_iter, save_inter=True) # Validation if curr_iter % config.val_freq == 0: val_miou = validate(model, val_data_loader, None, curr_iter, config, transform_data_fn) if val_miou > best_val_miou: best_val_miou = val_miou best_val_iter = curr_iter checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou, best_val_iter, "best_val", save_inter=True) logging.info("Current best mIoU: {:.3f} at iter {}".format( best_val_miou, best_val_iter)) # Recover back model.train() # End of iteration curr_iter += 1 IoU = (total_correct_class) / (total_iou_deno_class + 1e-6) logging.info('train point avg class IoU: %f' % ((IoU).mean() * 100.)) epoch += 1 # Explicit memory cleanup if hasattr(data_iter, 'cleanup'): data_iter.cleanup() # Save the final model checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou, best_val_iter) v_loss, v_score, v_mAP, val_miou = test(model, val_data_loader, config) if val_miou > best_val_miou: best_val_miou = val_miou best_val_iter = curr_iter checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou, best_val_iter, "best_val") logging.info("Current best mIoU: {:.3f} at iter {}".format( best_val_miou, best_val_iter))
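# train_distill() matches the student's intermediate "anchor" activations to the
# frozen teacher's via DistillLoss(). The exact form of DistillLoss is not shown
# in these snippets; the sketch below assumes a plain L2 (MSE) match, which is
# what the surrounding comments describe, and detaches the teacher so no
# gradient flows back into it.
import torch
import torch.nn.functional as F

def distill_loss_sketch(teacher_anchor, student_anchor):
    if isinstance(teacher_anchor, (list, tuple)):
        return sum(F.mse_loss(s, t.detach()) for s, t in zip(student_anchor, teacher_anchor))
    return F.mse_loss(student_anchor, teacher_anchor.detach())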
def train(pipeline_model, data_loader, val_data_loader, config): # Set up the train flag for batch normalization pipeline_model.train() num_devices = torch.cuda.device_count() num_devices = min(config.max_ngpu, num_devices) devices = list(range(num_devices)) target_device = devices[0] pipeline_model.to(target_device) if num_devices > 1: pipeline_model = ME.MinkowskiSyncBatchNorm.convert_sync_batchnorm( pipeline_model, devices) # Configuration writer = SummaryWriter(logdir=config.log_dir) data_timer, iter_timer = Timer(), Timer() data_time_avg, iter_time_avg = AverageMeter(), AverageMeter() meters = collections.defaultdict(AverageMeter) hists = pipeline_model.initialize_hists() optimizer = pipeline_model.initialize_optimizer(config) scheduler = pipeline_model.initialize_scheduler(optimizer, config) writer = SummaryWriter(logdir=config.log_dir) # Train the network logging.info('===> Start training') best_val, best_val_iter, curr_iter, epoch, is_training = 0, 0, 1, 1, True if config.resume: if osp.isfile(config.resume): logging.info("=> loading checkpoint '{}'".format(config.resume)) state = torch.load(config.resume) curr_iter = state['iteration'] + 1 epoch = state['epoch'] pipeline_model.load_state_dict(state['state_dict']) if config.resume_optimizer: curr_iter = state['iteration'] + 1 scheduler = pipeline_model.initialize_scheduler( optimizer, config, last_step=curr_iter) pipeline_model.load_optimizer(optimizer, state['optimizer']) if 'best_val' in state: best_val = state['best_val'] best_val_iter = state['best_val_iter'] logging.info("=> loaded checkpoint '{}' (epoch {})".format( config.resume, state['epoch'])) else: logging.info("=> no checkpoint found at '{}'".format( config.resume)) data_iter = data_loader.__iter__() while is_training: for iteration in range(len(data_loader)): pipeline_model.reset_gradient(optimizer) iter_timer.tic() pipelines = parallel.replicate(pipeline_model, devices) # Get training data data_timer.tic() inputs = [] for pipeline, device in zip(pipelines, devices): with torch.cuda.device(device): while True: datum = pipeline.load_datum(data_iter, has_gt=True) num_boxes = sum(box.shape[0] for box in datum['bboxes_coords']) if config.skip_empty_boxes and num_boxes == 0: continue break inputs.append(datum) data_time_avg.update(data_timer.toc(False)) outputs = parallel.parallel_apply(pipelines, [(x, True) for x in inputs], devices=devices) losses = parallel.parallel_apply( [pipeline.loss for pipeline in pipelines], tuple(zip(inputs, outputs)), devices=devices) losses = parallel.gather(losses, target_device) losses = dict([(k, v.mean()) for k, v in losses.items()]) meters, hists = pipeline_model.update_meters(meters, hists, losses) # Compute and accumulate gradient losses['loss'].backward() # Update number of steps pipeline_model.step_optimizer(losses, optimizer, scheduler, iteration) iter_time_avg.update(iter_timer.toc(False)) if curr_iter >= config.max_iter: is_training = False break if curr_iter % config.stat_freq == 0 or curr_iter == 1: lrs = ', '.join([ '{:.3e}'.format(x) for x in scheduler['default'].get_lr() ]) debug_str = "===> Epoch[{}]({}/{}): LR: {}\n".format( epoch, curr_iter, len(data_loader), lrs) debug_str += log_meters(meters, log_perclass_meters=False) debug_str += f"\n data time: {data_time_avg.avg:.3f}" debug_str += f" iter time: {iter_time_avg.avg:.3f}" logging.info(debug_str) # Reset timers data_time_avg.reset() iter_time_avg.reset() # Write logs update_writer(writer, meters, curr_iter, 'training') writer.add_scalar('training/learning_rate', 
scheduler['default'].get_lr()[0], curr_iter) # Reset meters reset_meters(meters, hists) # Save current status, save before val to prevent occational mem overflow if curr_iter % config.save_freq == 0: checkpoint(pipeline_model, optimizer, epoch, curr_iter, config, best_val, best_val_iter) if config.heldout_save_freq > 0 and curr_iter % config.heldout_save_freq == 0: checkpoint(pipeline_model, optimizer, epoch, curr_iter, config, best_val, best_val_iter, heldout_save=True) # Validation if curr_iter % config.val_freq == 0: if num_devices > 1: unconvert_sync_batchnorm(pipeline_model) best_val, best_val_iter = validate(pipeline_model, val_data_loader, config, writer, curr_iter, best_val, best_val_iter, optimizer, epoch) if num_devices > 1: pipeline_model = ME.MinkowskiSyncBatchNorm.convert_sync_batchnorm( pipeline_model, devices) if curr_iter % config.empty_cache_freq == 0: # Clear cache torch.cuda.empty_cache() # End of iteration curr_iter += 1 epoch += 1 # Explicit memory cleanup if hasattr(data_iter, 'cleanup'): data_iter.cleanup() # Save the final model if num_devices > 1: unconvert_sync_batchnorm(pipeline_model) validate(pipeline_model, val_data_loader, config, writer, curr_iter, best_val, best_val_iter, optimizer, epoch) if num_devices > 1: pipeline_model = ME.MinkowskiSyncBatchNorm.convert_sync_batchnorm( pipeline_model, devices) checkpoint(pipeline_model, optimizer, epoch, curr_iter, config, best_val, best_val_iter)
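# The checkpoint() helper is called throughout but never defined in these
# snippets. Judging from what the resume branches read back ('iteration',
# 'epoch', 'state_dict', 'optimizer', 'best_val', 'best_val_iter'), a minimal
# sketch could look like this; the postfix/save_inter/heldout_save handling of
# the real helper is assumed, not shown.
import os
import torch

def checkpoint_sketch(model, optimizer, epoch, iteration, config,
                      best_val, best_val_iter, postfix=None):
    state = {
        'iteration': iteration,
        'epoch': epoch,
        'state_dict': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'best_val': best_val,
        'best_val_iter': best_val_iter,
    }
    filename = 'weights.pth' if postfix is None else 'weights_{}.pth'.format(postfix)
    torch.save(state, os.path.join(config.log_dir, filename))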
def train(self, train_queue, val_queue=None):
    '''
    Given data queues, train the network
    '''
    # Parameter directory
    save_dir = os.path.join(cfg.DIR.OUT_PATH)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # Timer for the training op and parallel data loading op.
    train_timer = Timer()
    data_timer = Timer()
    training_losses = []

    # Setup learning rates
    lr_steps = [int(k) for k in cfg.TRAIN.LEARNING_RATES.keys()]

    # Setup the lr_scheduler
    self.lr_scheduler = lr_scheduler.MultiStepLR(self.optimizer, lr_steps, gamma=0.1)  # gamma is the decay factor

    start_iter = 0
    # Resume training
    if cfg.TRAIN.RESUME_TRAIN:
        self.load(cfg.CONST.WEIGHTS)
        start_iter = cfg.TRAIN.INITIAL_ITERATION

    if cfg.TRAIN.SHOW_LOSS:  # live loss plotting
        import matplotlib.pyplot as plot
        plot.figure(1, figsize=(12, 5))
        plot.ion()

    # Main training loop
    for train_ind in range(start_iter, cfg.TRAIN.NUM_ITERATION + 1):
        self.lr_scheduler.step()
        data_timer.tic()
        batch_img, batch_voxel = train_queue.get()
        data_timer.toc()

        if self.net.is_x_tensor4:
            batch_img = batch_img[0]

        # Apply one gradient step
        train_timer.tic()
        loss = self.train_loss(batch_img, batch_voxel)
        train_timer.toc()

        # print(loss)
        # training_losses.append(loss.data)  # convert to a numpy array first
        # print(type(loss))
        # print(loss.data.numpy())
        # print(loss.data.numpy().shape)
        # print(type(loss.data.numpy()))
        if torch.cuda.is_available():
            training_losses.append(loss.cpu().data.numpy())
        else:
            training_losses.append(loss.data.numpy())

        # Decrease learning rate at certain points
        if train_ind in lr_steps:
            # For a pytorch optimizer, the learning rate can only be set when the optimizer is created,
            # or by using torch.optim.lr_scheduler
            print('Learning rate decreased to %f: ' % cfg.TRAIN.LEARNING_RATES[str(train_ind)])

        # '''
        # Debugging modules
        # '''

        # Print status, run validation, check divergence, and save model.
        if train_ind % cfg.TRAIN.PRINT_FREQ == 0:  # 40
            # Print the current loss
            print('%s Iter: %d Loss: %f' % (datetime.now(), train_ind, loss))

            '''
            @TODO(dingyadong): loss dynamic Visualization
            '''
            # plot
            if train_ind != 0:
                steps = np.linspace(0, train_ind, train_ind + 1, dtype=np.float32)
                if cfg.TRAIN.SHOW_LOSS:  # live loss plotting
                    plot.plot(steps, training_losses, 'b-')
                    plot.draw()
                    # plot.pause(0.05)

        if train_ind % cfg.TRAIN.VALIDATION_FREQ == 0 and val_queue is not None:
            # Print test loss and params to check convergence every N iterations
            val_losses = 0
            for i in range(cfg.TRAIN.NUM_VALIDATION_ITERATIONS):
                batch_img, batch_voxel = val_queue.get()
                val_loss = self.train_loss(batch_img, batch_voxel)
                val_losses += val_loss
            var_losses_mean = val_losses / cfg.TRAIN.NUM_VALIDATION_ITERATIONS
            print('%s Test loss: %f' % (datetime.now(), var_losses_mean))

        if train_ind % cfg.TRAIN.NAN_CHECK_FREQ == 0:
            # Check that the network parameters are all valid
            nan_or_max_param = max_or_nan(self.net.parameters())
            if has_nan(nan_or_max_param):
                print('NAN detected')
                break

        if train_ind % cfg.TRAIN.SAVE_FREQ == 0 and not train_ind == 0:
            self.save(training_losses, save_dir, train_ind)

        # loss is a Variable containing a torch.FloatTensor of size 1
        if loss.data > cfg.TRAIN.LOSS_LIMIT:
            print("Cost exceeds the threshold. Stop training")
            break

    if cfg.TRAIN.SHOW_LOSS:  # live loss plotting
        plot.ioff()
        plot.show()
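# The schedule above is a standard torch.optim.lr_scheduler.MultiStepLR: the
# learning rate is multiplied by gamma=0.1 every time the iteration counter
# passes one of the milestones parsed from cfg.TRAIN.LEARNING_RATES. A small
# standalone illustration (note: PyTorch >= 1.1 expects optimizer.step() before
# scheduler.step(), whereas the loop above steps the scheduler first, old-style):
import torch
from torch.optim import SGD, lr_scheduler

params = [torch.nn.Parameter(torch.zeros(1))]
optimizer = SGD(params, lr=0.1)
scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[30, 80], gamma=0.1)
for _ in range(100):
    optimizer.step()
    scheduler.step()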
def test(pipeline_model, data_loader, config, has_gt=True): global_timer, data_timer, iter_timer = Timer(), Timer(), Timer() meters = collections.defaultdict(AverageMeter) hists = pipeline_model.initialize_hists() logging.info('===> Start testing') global_timer.tic() data_iter = data_loader.__iter__() max_iter = len(data_loader) # Fix batch normalization running mean and std pipeline_model.eval() # Clear cache (when run in val mode, cleanup training cache) torch.cuda.empty_cache() if config.save_prediction or config.test_original_pointcloud: if config.save_prediction: save_pred_dir = config.save_pred_dir os.makedirs(save_pred_dir, exist_ok=True) else: save_pred_dir = tempfile.mkdtemp() if os.listdir(save_pred_dir): raise ValueError(f'Directory {save_pred_dir} not empty. ' 'Please remove the existing prediction.') with torch.no_grad(): for iteration in range(max_iter): iter_timer.tic() data_timer.tic() datum = pipeline_model.load_datum(data_iter, has_gt=has_gt) data_time = data_timer.toc(False) output_dict = pipeline_model(datum, False) iter_time = iter_timer.toc(False) if config.save_prediction or config.test_original_pointcloud: pipeline_model.save_prediction(datum, output_dict, save_pred_dir, iteration) if config.visualize and iteration % config.visualize_freq == 0: pipeline_model.visualize_predictions(datum, output_dict, iteration) if has_gt: loss_dict = pipeline_model.loss(datum, output_dict) if config.visualize and iteration % config.visualize_freq == 0: pipeline_model.visualize_groundtruth(datum, iteration) loss_dict.update(pipeline_model.evaluate(datum, output_dict)) meters, hists = pipeline_model.update_meters( meters, hists, loss_dict) if iteration % config.test_stat_freq == 0 and iteration > 0: debug_str = "===> {}/{}\n".format(iteration, max_iter) debug_str += log_meters(meters, log_perclass_meters=True) debug_str += f"\n data time: {data_time:.3f} iter time: {iter_time:.3f}" logging.info(debug_str) if iteration % config.empty_cache_freq == 0: # Clear cache torch.cuda.empty_cache() global_time = global_timer.toc(False) debug_str = "===> Final test results:\n" debug_str += log_meters(meters, log_perclass_meters=True) logging.info(debug_str) if config.test_original_pointcloud: pipeline_model.test_original_pointcloud(save_pred_dir) logging.info('Finished test. Elapsed time: {:.4f}'.format(global_time)) # Explicit memory cleanup if hasattr(data_iter, 'cleanup'): data_iter.cleanup() return meters
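# meters above is a defaultdict of AverageMeter objects. A minimal sketch of
# the meter the logging code assumes (running sum/count with .avg, reset
# between logging windows); the real implementation may track more fields.
class AverageMeterSketch:
    def __init__(self):
        self.reset()

    def reset(self):
        self.sum, self.count, self.avg = 0.0, 0, 0.0

    def update(self, val, n=1):
        self.sum += val * n
        self.count += n
        self.avg = self.sum / max(self.count, 1)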
def train(self, generator_queue, discriminator_queue): ''' Given data queues, train the network ''' # Parameter directory save_dir = os.path.join(cfg.DIR.OUT_PATH) if not os.path.exists(save_dir): os.makedirs(save_dir) # Timer for the training op and parallel data loading op. train_timer = Timer() data_timer = Timer() start_iter = 0 # Resume training if cfg.TRAIN.RESUME_TRAIN: self.net.load(cfg.CONST.WEIGHTS) start_iter = cfg.TRAIN.INITIAL_ITERATION # Setup learning rates lr = cfg.TRAIN.DEFAULT_LEARNING_RATE lr_steps = [int(k) for k in cfg.TRAIN.LEARNING_RATES.keys()] print('Set the learning rate to %f.' % lr) gen_lr = lr discriminator_losses = [] generator_losses = [] mask_losses = [] generator_idx = 0 voxel_loss = 0 generator_loss = 0 mask_loss = 0 discriminator_loss = 0 # Main training loop for train_ind in range(start_iter, cfg.TRAIN.NUM_ITERATION + 1): self.net.noise.set_value(max(1 - float(train_ind) / 20000., 1e-8)) data_timer.tic() gen_img, gen_camera, _ = generator_queue.get() data_timer.toc() data_timer.tic() _, _, disc_voxel = discriminator_queue.get() data_timer.toc() # Apply one gradient step train_timer.tic() if self.net.discriminator_loss is not None: error_F, error_R = self.evaluate_discriminator( gen_img, gen_camera, disc_voxel) if error_F > 0.2 or error_R > 0.2: self.set_lr(gen_lr / 100.) discriminator_loss = self.discriminator_train_loss( gen_img, gen_camera, disc_voxel) discriminator_losses.append(discriminator_loss) self.set_lr(gen_lr) results = self.generator_train_loss(gen_img, gen_camera) generator_loss = results[0] generator_losses.append(generator_loss) generator_idx += 1 mask_loss = results[1] mask_losses.append(mask_loss) activations = results[2:] train_timer.toc() # Decrease learning rate at certain points if train_ind in lr_steps: # edict only takes string for key. Hacky way gen_lr = np.float(cfg.TRAIN.LEARNING_RATES[str(train_ind)]) print('Learing rate decreased to %f: ' % gen_lr) # Debugging modules # # Print status, run validation, check divergence, and save model. if train_ind % cfg.TRAIN.PRINT_FREQ == 0: # Print the current loss get_mean = lambda x, y: np.mean(x) if x else y print("""%s Iter %d: Discriminator loss %f, Generator """ """loss %f, Mask loss %f""" % (datetime.now(), train_ind, get_mean(discriminator_losses, discriminator_loss), get_mean(generator_losses, generator_loss), get_mean(mask_losses, mask_loss))) discriminator_losses = [] generator_losses = [] mask_losses = [] if train_ind % cfg.TRAIN.NAN_CHECK_FREQ == 0: # Check that the network parameters are all valid max_param = max_or_nan(self.net.all_params) if np.isnan(max_param): print('NAN detected') break if train_ind % cfg.TRAIN.SAVE_FREQ == 0 and not train_ind == 0: self.save(mask_losses, save_dir, train_ind)
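# cfg.TRAIN.LEARNING_RATES above maps iteration numbers (stored as strings,
# since edict only accepts string keys) to the learning rate applied from that
# iteration on. A sketch of how such a table resolves to a rate for a given
# iteration; the helper name is illustrative:
def lr_for_iteration_sketch(train_ind, default_lr, learning_rates):
    lr = default_lr
    for step in sorted(int(k) for k in learning_rates.keys()):
        if train_ind >= step:
            lr = float(learning_rates[str(step)])
    return lr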