def setup_logging(output_dir=None):
    """
    Sets up the logging for multiple processes. Only enable the logging for the
    master process, and suppress logging for the non-master processes.
    """
    # Set up logging format.
    _FORMAT = "[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s"
    if du.is_master_proc():
        # Enable logging for the master process.
        logging.root.handlers = []
    else:
        # Suppress logging for non-master processes.
        _suppress_print()

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    logger.propagate = False
    plain_formatter = logging.Formatter(
        "[%(asctime)s][%(levelname)s] %(filename)s: %(lineno)3d: %(message)s",
        datefmt="%m/%d %H:%M:%S",
    )

    if du.is_master_proc():
        ch = logging.StreamHandler(stream=sys.stdout)
        ch.setLevel(logging.DEBUG)
        ch.setFormatter(plain_formatter)
        logger.addHandler(ch)

    if output_dir is not None and du.is_master_proc(du.get_world_size()):
        filename = os.path.join(output_dir, "stdout.log")
        fh = logging.StreamHandler(_cached_log_stream(filename))
        fh.setLevel(logging.DEBUG)
        fh.setFormatter(plain_formatter)
        logger.addHandler(fh)
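# A minimal usage sketch for the setup above, assuming the standard PySlowFast
# module layout (slowfast.utils.distributed as `du`, slowfast.utils.logging as
# `logging`, which also provides get_logger). The entry point `run_demo` is
# hypothetical, not a function from this file.
import slowfast.utils.distributed as du
import slowfast.utils.logging as logging


def run_demo(cfg):
    # Initialize torch.distributed first so is_master_proc() is meaningful.
    du.init_distributed_training(cfg)
    # Master process logs to stdout (and OUTPUT_DIR/stdout.log); workers are silenced.
    logging.setup_logging(cfg.OUTPUT_DIR)
    logger = logging.get_logger(__name__)
    logger.info("Only the master process emits this line.")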
def test(cfg):
    """
    Perform multi-view testing on the pretrained video model.
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Set up environment.
    du.init_distributed_training(cfg)
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Setup logging format.
    logging.setup_logging(cfg.OUTPUT_DIR)

    # Print config.
    logger.info("Test with config:")
    logger.info(cfg)

    # Build the video model and print model statistics.
    model = build_model(cfg)
    if du.is_master_proc() and cfg.LOG_MODEL_INFO:
        misc.log_model_info(model, cfg, use_train_input=False)

    cu.load_test_checkpoint(cfg, model)

    # Create video testing loaders.
    test_loader = loader.construct_loader(cfg, "test")
    logger.info("Testing model for {} iterations".format(len(test_loader)))

    if cfg.DETECTION.ENABLE:
        assert cfg.NUM_GPUS == cfg.TEST.BATCH_SIZE or cfg.NUM_GPUS == 0
        test_meter = AVAMeter(len(test_loader), cfg, mode="test")
    else:
        assert (
            test_loader.dataset.num_videos
            % (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS)
            == 0
        )
        # Create meters for multi-view testing.
        test_meter = TestMeter(
            test_loader.dataset.num_videos
            // (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS),
            cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS,
            cfg.MODEL.NUM_CLASSES,
            len(test_loader),
            cfg.DATA.MULTI_LABEL,
            cfg.DATA.ENSEMBLE_METHOD,
        )

    # Set up writer for logging to Tensorboard format.
    if cfg.TENSORBOARD.ENABLE and du.is_master_proc(
        cfg.NUM_GPUS * cfg.NUM_SHARDS
    ):
        writer = tb.TensorboardWriter(cfg)
    else:
        writer = None

    # Perform multi-view test on the entire dataset.
    test_meter = perform_test(test_loader, model, test_meter, cfg, writer)

    if writer is not None:
        writer.close()
def visualize(cfg):
    """
    Perform layer weights and activations visualization on the model.
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    if cfg.TENSORBOARD.ENABLE and cfg.TENSORBOARD.MODEL_VIS.ENABLE:
        # Set up environment.
        du.init_distributed_training(cfg)
        # Set random seed from configs.
        np.random.seed(cfg.RNG_SEED)
        torch.manual_seed(cfg.RNG_SEED)

        # Setup logging format.
        logging.setup_logging(cfg.OUTPUT_DIR)

        # Print config.
        logger.info("Model Visualization with config:")
        logger.info(cfg)

        # Build the video model and print model statistics.
        model = build_model(cfg)
        if du.is_master_proc() and cfg.LOG_MODEL_INFO:
            misc.log_model_info(model, cfg, is_train=False)

        cu.load_test_checkpoint(cfg, model)

        # Create video testing loaders.
        vis_loader = loader.construct_loader(cfg, "test")
        logger.info(
            "Visualize model for {} data points".format(len(vis_loader))
        )

        if cfg.DETECTION.ENABLE:
            assert cfg.NUM_GPUS == cfg.TEST.BATCH_SIZE

        # Set up writer for logging to Tensorboard format.
        if du.is_master_proc(cfg.NUM_GPUS * cfg.NUM_SHARDS):
            writer = tb.TensorboardWriter(cfg)
        else:
            writer = None

        # Run visualization on the model.
        run_visualization(vis_loader, model, cfg, writer)
        if writer is not None:
            writer.close()
def __init__(self, max_iter, cfg, tblogger):
    """
    Args:
        max_iter (int): the max number of iteration of the current epoch.
        cfg (CfgNode): configs.
        tblogger (SummaryWriter): TensorBoard writer for logging validation stats.
    """
    self._cfg = cfg
    self.max_iter = max_iter
    self.iter_timer = Timer()
    # Current minibatch errors (smoothed over a window).
    self.mb_top1_err = ScalarMeter(cfg.LOGS.PERIOD)
    self.mb_top5_err = ScalarMeter(cfg.LOGS.PERIOD)
    # Min errors (over the full val set).
    self.min_top1_err = 100.0
    self.min_top5_err = 100.0
    # Number of misclassified examples.
    self.num_top1_mis = 0
    self.num_top5_mis = 0
    self.num_samples = 0
    # TensorBoard logger, only kept on the master process; None elsewhere so
    # the attribute always exists.
    self.tblogger = tblogger if du.is_master_proc() else None
    self.all_preds = []
    self.all_labels = []
def save_checkpoint(path_to_job, model, optimizer, epoch, cfg, is_best_epoch=False):
    """
    Save a checkpoint.
    Args:
        path_to_job (string): the path to the folder of the current job.
        model (model): model to save the weight to the checkpoint.
        optimizer (optim): optimizer to save the historical state.
        epoch (int): current number of epoch of the model.
        cfg (CfgNode): configs to save.
        is_best_epoch (bool): if True, name the checkpoint as the best epoch.
    """
    # Save checkpoints only from the master process.
    if not du.is_master_proc(cfg.NUM_GPUS * cfg.NUM_SHARDS):
        return
    # Ensure that the checkpoint dir exists.
    os.makedirs(get_checkpoint_dir(path_to_job), exist_ok=True)
    # Omit the DDP wrapper in the multi-gpu setting.
    sd = model.module.state_dict() if cfg.NUM_GPUS > 1 else model.state_dict()

    # Record the state.
    checkpoint = {
        "epoch": epoch,
        "model_state": sd,
        "optimizer_state": optimizer.state_dict(),
        "cfg": cfg.dump(),
    }
    # Write the checkpoint.
    path_to_checkpoint = get_path_to_checkpoint(path_to_job, epoch + 1, is_best_epoch)
    torch.save(checkpoint, path_to_checkpoint)
    return path_to_checkpoint
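# Round-trip sketch for the checkpoint format written above. The dict layout
# ("epoch", "model_state", "optimizer_state", "cfg") comes from
# save_checkpoint(); `resume` is a hypothetical helper, shown under the
# assumption that loading mirrors saving.
import torch


def resume(path_to_checkpoint, model, optimizer):
    checkpoint = torch.load(path_to_checkpoint, map_location="cpu")
    # Checkpoints store the unwrapped state dict, so load into model.module
    # when the model is wrapped in DistributedDataParallel.
    target = model.module if hasattr(model, "module") else model
    target.load_state_dict(checkpoint["model_state"])
    optimizer.load_state_dict(checkpoint["optimizer_state"])
    # Files are named by epoch + 1; the stored value is the epoch that just
    # finished, so training resumes at the next one.
    return checkpoint["epoch"] + 1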
def __init__(self, epoch_iters, cfg):
    """
    Args:
        epoch_iters (int): the overall number of iterations of one epoch.
        cfg (CfgNode): configs.
    """
    self._cfg = cfg
    self.epoch_iters = epoch_iters
    self.MAX_EPOCH = cfg.SOLVER.MAX_EPOCH * epoch_iters
    self.iter_timer = Timer()
    self.loss = ScalarMeter(cfg.LOGS.PERIOD)
    self.loss_total = 0.0
    self.lr = None
    # Current minibatch errors (smoothed over a window).
    self.mb_top1_err = ScalarMeter(cfg.LOGS.PERIOD)
    self.mb_top5_err = ScalarMeter(cfg.LOGS.PERIOD)
    # Number of misclassified examples.
    self.num_top1_mis = 0
    self.num_top5_mis = 0
    self.num_samples = 0
    # tblogger, only created on the master process.
    if du.is_master_proc():
        self.tblogger = SummaryWriter(self._cfg.LOGS.DIR)
    else:
        self.tblogger = None
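# Illustrative sketch of the windowed smoothing that ScalarMeter provides to
# the meters above (the real implementation lives in slowfast.utils.meters;
# this is a reimplementation for clarity, not the library code).
from collections import deque
import statistics


class WindowedScalar:
    """Tracks a scalar over a fixed-size window, as the meters above expect."""

    def __init__(self, window_size):
        self.deque = deque(maxlen=window_size)

    def add_value(self, value):
        self.deque.append(value)

    def get_win_median(self):
        # log_iter_stats() reports the window median to damp noisy minibatches.
        return statistics.median(self.deque)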
def inference(cfg):
    # Set up environment.
    du.init_distributed_training(cfg)
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Build the video model and print model statistics.
    model = build_model(cfg)
    if du.is_master_proc() and cfg.LOG_MODEL_INFO:
        misc.log_model_info(model, cfg, use_train_input=False)

    cu.load_test_checkpoint(cfg, model)

    # Create video loaders.
    video_loader = loader.construct_loader(cfg, "test")

    # Create saver.
    saver = Saver(cfg.DATA.PATH_TO_DATA_DIR, video_loader.dataset)

    model.eval()
    for _, (inputs, index) in tqdm(
        enumerate(video_loader), total=len(video_loader)
    ):
        for i in range(len(inputs)):
            inputs[i] = inputs[i].cuda(non_blocking=True)
        index = index.cuda()
        feats = model(inputs)
        # Gather all the predictions across all the devices to perform ensemble.
        if cfg.NUM_GPUS > 1:
            feats, index = du.all_gather([feats, index])
        saver.save(feats, index)

    saver.merge()
def log_epoch_stats(self, cur_epoch):
    """
    Log the stats of the current epoch.
    Args:
        cur_epoch (int): the number of current epoch.
    """
    stats = {
        "_type": "val_epoch",
        "epoch": "{}/{}".format(cur_epoch + 1, self._cfg.SOLVER.MAX_EPOCH),
        "time_diff": self.iter_timer.seconds(),
        "gpu_mem": "{:.2f} GB".format(misc.gpu_mem_usage()),
        "RAM": "{:.2f}/{:.2f} GB".format(*misc.cpu_mem_usage()),
    }
    if self._cfg.DATA.MULTI_LABEL:
        stats["map"] = get_map(
            torch.cat(self.all_preds).cpu().numpy(),
            torch.cat(self.all_labels).cpu().numpy(),
        )
    else:
        top1_err = self.num_top1_mis / self.num_samples
        top5_err = self.num_top5_mis / self.num_samples
        self.min_top1_err = min(self.min_top1_err, top1_err)
        self.min_top5_err = min(self.min_top5_err, top5_err)
        stats["top1_err"] = top1_err
        stats["top5_err"] = top5_err
        stats["min_top1_err"] = self.min_top1_err
        stats["min_top5_err"] = self.min_top5_err
    logging.log_json_stats(stats)

    if du.is_master_proc():
        for k, v in stats.items():
            if 'err' in k or 'map' in k:
                self.tblogger.add_scalar('val/{}'.format(k), v, cur_epoch + 1)
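# Sketch of the mAP computation that get_map() performs for the multi-label
# branch above. This is an assumption about its behavior, not the codebase's
# helper: it presumes sklearn is available and (num_samples, num_classes)
# score/label arrays, averaging per-class AP over classes with positives.
import numpy as np
from sklearn.metrics import average_precision_score


def get_map_sketch(preds, labels):
    aps = []
    for c in range(labels.shape[1]):
        # Classes with no positive samples have undefined AP; skip them.
        if labels[:, c].sum() > 0:
            aps.append(average_precision_score(labels[:, c], preds[:, c]))
    return float(np.mean(aps))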
def save_checkpoint(path_to_job, model, optimizer, epoch, cfg):
    """
    Save a checkpoint.
    Args:
        path_to_job (string): the path to the folder of the current job.
        model (model): model to save the weight to the checkpoint.
        optimizer (optim): optimizer to save the historical state.
        epoch (int): current number of epoch of the model.
        cfg (CfgNode): configs to save.
    """
    # Save checkpoints only from the master process.
    if not du.is_master_proc(cfg.NUM_GPUS * cfg.NUM_SHARDS):
        return
    # Ensure that the checkpoint dir exists.
    PathManager.mkdirs(get_checkpoint_dir(path_to_job))
    # Omit the DDP wrapper in the multi-gpu setting.
    sd = model.module.state_dict() if cfg.NUM_GPUS > 1 else model.state_dict()
    normalized_sd = sub_to_normal_bn(sd)

    # Record the state.
    checkpoint = {
        "epoch": epoch,
        "model_state": normalized_sd,
        "optimizer_state": optimizer.state_dict(),
        "cfg": cfg.dump(),
    }
    # Write the checkpoint.
    path_to_checkpoint = get_path_to_checkpoint(path_to_job, epoch + 1)
    with PathManager.open(path_to_checkpoint, "wb") as f:
        torch.save(checkpoint, f)
    return path_to_checkpoint
def test_from_train(model, cfg, cnt=-1):
    test_loader = loader.construct_loader(cfg, "test")
    logger.info("Testing model for {} iterations".format(len(test_loader)))

    if cfg.DETECTION.ENABLE:
        assert cfg.NUM_GPUS == cfg.TEST.BATCH_SIZE
        test_meter = AVAMeter(len(test_loader), cfg, mode="test")
    else:
        assert (
            len(test_loader.dataset)
            % (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS)
            == 0
        )
        # Create meters for multi-view testing.
        if cfg.TEST.DATASET == 'epickitchens':
            test_meter = EPICTestMeter(
                len(test_loader.dataset)
                // (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS),
                cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS,
                cfg.MODEL.NUM_CLASSES,
                len(test_loader),
            )
        else:
            test_meter = TestMeter(
                len(test_loader.dataset)
                // (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS),
                cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS,
                cfg.MODEL.NUM_CLASSES,
                len(test_loader),
            )

    # Perform multi-view test on the entire dataset.
    scores_path = os.path.join(cfg.OUTPUT_DIR, 'scores')
    if not os.path.exists(scores_path):
        os.makedirs(scores_path)
    filename_root = cfg.EPICKITCHENS.TEST_LIST.split('.')[0]
    if cnt >= 0:
        file_name = '{}_{}_{}.pkl'.format(filename_root, cnt, cfg.MODEL.MODEL_NAME)
    else:
        file_name = '{}_{}_{}.pkl'.format(filename_root, 'test_only', cfg.MODEL.MODEL_NAME)
    file_path = os.path.join(scores_path, file_name)
    logger.info(file_path)
    pickle.dump([], open(file_path, 'wb+'))

    preds, labels, metadata = perform_test(test_loader, model, test_meter, cfg)

    if du.is_master_proc():
        if cfg.TEST.DATASET == 'epickitchens':
            results = {
                'verb_output': preds[0],
                'noun_output': preds[1],
                'verb_gt': labels[0],
                'noun_gt': labels[1],
                'narration_id': metadata,
            }
            scores_path = os.path.join(cfg.OUTPUT_DIR, 'scores')
            if not os.path.exists(scores_path):
                os.makedirs(scores_path)
            pickle.dump(results, open(file_path, 'wb'))
def make_checkpoint_dir(path_to_job):
    """
    Creates the checkpoint directory (if not present already).
    Args:
        path_to_job (string): the path to the folder of the current job.
    """
    checkpoint_dir = os.path.join(path_to_job, "checkpoints")
    # Create the checkpoint dir from the master process.
    if du.is_master_proc() and not os.path.exists(checkpoint_dir):
        try:
            os.makedirs(checkpoint_dir)
        except Exception:
            pass
    return checkpoint_dir
def log_json_stats(stats):
    """
    Logs json stats.
    Args:
        stats (dict): a dictionary of statistical information to log.
    """
    stats = {
        k: decimal.Decimal("{:.6f}".format(v)) if isinstance(v, float) else v
        for k, v in stats.items()
    }
    # json_stats = simplejson.dumps(stats, sort_keys=True, use_decimal=True)
    logstr = "; ".join(["{}: {}".format(k, v) for k, v in stats.items()])
    if du.is_master_proc():
        logger = get_logger(__name__)
        logger.info(logstr)
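# Usage sketch: stats dicts passed to log_json_stats() are flat key/value
# pairs, and floats are rendered with six decimal places via decimal.Decimal.
# The values below are made up for illustration.
stats_example = {
    "_type": "train_iter",
    "epoch": "3/100",
    "loss": 1.2345678,  # logged as 1.234568
    "lr": 0.1,
}
log_json_stats(stats_example)
# Master process logs: _type: train_iter; epoch: 3/100; loss: 1.234568; lr: 0.100000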
def run_evaluation(categories, groundtruth, detections, verbose=True):
    """AVA evaluation main logic."""
    pascal_evaluator = object_detection_evaluation.PascalDetectionEvaluator(
        categories
    )

    boxes, labels, _ = groundtruth

    gt_keys = []
    pred_keys = []

    for image_key in boxes:
        pascal_evaluator.add_single_ground_truth_image_info(
            image_key,
            {
                standard_fields.InputDataFields.groundtruth_boxes: np.array(
                    boxes[image_key], dtype=float
                ),
                standard_fields.InputDataFields.groundtruth_classes: np.array(
                    labels[image_key], dtype=int
                ),
                standard_fields.InputDataFields.groundtruth_difficult: np.zeros(
                    len(boxes[image_key]), dtype=bool
                ),
            },
        )
        gt_keys.append(image_key)

    boxes, labels, scores = detections

    for image_key in boxes:
        pascal_evaluator.add_single_detected_image_info(
            image_key,
            {
                standard_fields.DetectionResultFields.detection_boxes: np.array(
                    boxes[image_key], dtype=float
                ),
                standard_fields.DetectionResultFields.detection_classes: np.array(
                    labels[image_key], dtype=int
                ),
                standard_fields.DetectionResultFields.detection_scores: np.array(
                    scores[image_key], dtype=float
                ),
            },
        )
        pred_keys.append(image_key)

    metrics = pascal_evaluator.evaluate()

    if du.is_master_proc():
        pprint.pprint(metrics, indent=2)
    return metrics
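# Sketch of the inputs run_evaluation() expects, inferred from how the tuples
# are unpacked above: both groundtruth and detections map image keys to
# parallel per-box lists. All keys, ids, and coordinates below are
# illustrative only; the box order follows the AVA convention (assumed).
categories = [{"id": 7, "name": "example_action"}]

boxes = {"vid_0902": [[0.1, 0.2, 0.4, 0.5]]}   # one [y1, x1, y2, x2] box
gt_labels = {"vid_0902": [7]}                  # one class id per box
groundtruth = (boxes, gt_labels, None)         # third element is unused above

det_scores = {"vid_0902": [0.93]}              # one confidence per detection
detections = (boxes, gt_labels, det_scores)

metrics = run_evaluation(categories, groundtruth, detections)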
def setup_logging():
    """
    Sets up the logging for multiple processes. Only enable the logging for the
    master process, and suppress logging for the non-master processes.
    """
    # Set up logging format.
    _FORMAT = "[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s"
    if du.is_master_proc():
        # Enable logging for the master process.
        logging.root.handlers = []
        logging.basicConfig(
            level=logging.INFO, format=_FORMAT, stream=sys.stdout
        )
    else:
        # Suppress logging for non-master processes.
        _suppress_print()
def plotStats(self, stats, ite, typ):
    if du.is_master_proc() and stats is not None:
        for k, v in stats.items():
            try:
                val = float(v)
                nme = "{}_{}".format(typ, k)
                # Clamp the value to the configured plot limits, if any.
                maxx = self.cfg.METRICS.PLOT_MAX_LIMITS.get(k, None)
                val = min(val, maxx) if maxx is not None else val
                minn = self.cfg.METRICS.PLOT_MIN_LIMITS.get(k, None)
                val = max(val, minn) if minn is not None else val
                self.logger.log_row(
                    name=nme,
                    iter=ite,
                    val=val,
                    description="{} master proc".format(nme),
                )
            except ValueError:
                # Skip stats that cannot be cast to float.
                pass
def build_trainer(cfg):
    """
    Build training model and its associated tools, including optimizer,
    dataloaders and meters.
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    Returns:
        model (nn.Module): training model.
        optimizer (Optimizer): optimizer.
        train_loader (DataLoader): training data loader.
        val_loader (DataLoader): validation data loader.
        precise_bn_loader (DataLoader): training data loader for computing
            precise BN.
        train_meter (TrainMeter): tool for measuring training stats.
        val_meter (ValMeter): tool for measuring validation stats.
    """
    # Build the video model and print model statistics.
    model = build_model(cfg)
    if du.is_master_proc() and cfg.LOG_MODEL_INFO:
        misc.log_model_info(model, cfg, use_train_input=True)

    # Construct the optimizer.
    optimizer = optim.construct_optimizer(model, cfg)

    # Create the video train and val loaders.
    train_loader = loader.construct_loader(cfg, "train")
    val_loader = loader.construct_loader(cfg, "val")
    precise_bn_loader = loader.construct_loader(cfg, "train", is_precise_bn=True)

    # Create meters.
    train_meter = TrainMeter(len(train_loader), cfg)
    val_meter = ValMeter(len(val_loader), cfg)

    return (
        model,
        optimizer,
        train_loader,
        val_loader,
        precise_bn_loader,
        train_meter,
        val_meter,
    )
def test(cfg):
    """
    Perform multi-view testing on the pretrained video model.
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Set up environment.
    du.init_distributed_training(cfg)
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Setup logging format.
    logging.setup_logging(cfg.OUTPUT_DIR)

    # Print config.
    logger.info("Test with config:")
    logger.info(cfg)

    # Build the video model and print model statistics.
    model = build_model(cfg)
    cu.load_test_checkpoint(cfg, model)

    # Create video testing loaders.
    test_loader = loader.construct_loader(cfg, "test")
    logger.info("Testing model for {} iterations".format(len(test_loader)))

    # Create meters for loss tracking.
    test_meter = TrainMeter(test_loader.dataset.num_videos, cfg)

    # Set up writer for logging to Tensorboard format.
    if cfg.TENSORBOARD.ENABLE and du.is_master_proc(
        cfg.NUM_GPUS * cfg.NUM_SHARDS
    ):
        writer = tb.TensorboardWriter(cfg)
    else:
        writer = None

    # Perform multi-view test on the entire dataset.
    test_meter = perform_test(test_loader, model, test_meter, cfg, writer)

    if writer is not None:
        writer.close()
def experiment(cfg):
    # Setup logging format.
    logging.setup_logging()
    # Print config.
    logger.info("Infer with config:")
    logger.info(cfg)

    # Build the SlowFast model and print its statistics.
    model = build_model(cfg)
    if du.is_master_proc():
        misc.log_model_info(model, cfg, is_train=False)

    # Load weights.
    if cfg.INFERENCE.WEIGHTS_FILE_PATH != "":
        cu.load_checkpoint(
            cfg.INFERENCE.WEIGHTS_FILE_PATH,
            model,
            cfg.NUM_GPUS > 1,
            None,
            inflation=False,
            convert_from_caffe2=cfg.INFERENCE.WEIGHTS_TYPE == "caffe2",
        )
    else:
        raise FileNotFoundError("Model weights file could not be found")

    perform_inference(model, cfg)
def log_iter_stats(self, cur_epoch, cur_iter):
    """
    Log the stats of the current iteration.
    Args:
        cur_epoch (int): the number of current epoch.
        cur_iter (int): the number of current iteration.
    """
    if (cur_iter + 1) % self._cfg.LOGS.PERIOD != 0:
        return
    eta_sec = self.iter_timer.seconds() * (
        self.MAX_EPOCH - (cur_epoch * self.epoch_iters + cur_iter + 1)
    )
    eta = str(datetime.timedelta(seconds=int(eta_sec)))
    stats = {
        "_type": "train_iter",
        "epoch": "{}/{}".format(cur_epoch + 1, self._cfg.SOLVER.MAX_EPOCH),
        "iter": "{}/{}".format(cur_iter + 1, self.epoch_iters),
        "time_diff": self.iter_timer.seconds(),
        "eta": eta,
        "loss": self.loss.get_win_median(),
        "lr": self.lr,
        "gpu_mem": "{:.2f} GB".format(misc.gpu_mem_usage()),
    }
    if not self._cfg.DATA.MULTI_LABEL:
        stats["top1_err"] = self.mb_top1_err.get_win_median()
        stats["top5_err"] = self.mb_top5_err.get_win_median()
    logging.log_json_stats(stats)

    if du.is_master_proc():
        iters = cur_iter + 1 + self.epoch_iters * cur_epoch
        for k, v in stats.items():
            if 'err' in k or 'loss' in k:
                self.tblogger.add_scalar('train/{}'.format(k), v, iters)
            elif k == 'eta':
                self.tblogger.add_scalar('other/eta', eta_sec, iters)
            elif k == 'epoch':
                self.tblogger.add_scalar('other/epoch', cur_epoch + 1, iters)
            elif k == 'lr':
                self.tblogger.add_scalar('other/lr', v, iters)
            else:
                continue
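# Standalone sketch of the ETA arithmetic above: the smoothed per-iteration
# time is extrapolated over every remaining iteration of the run (recall that
# MAX_EPOCH in this meter is cfg.SOLVER.MAX_EPOCH * epoch_iters, i.e. total
# iterations). Names here are local to the sketch, not meter attributes.
import datetime


def eta_string(iter_seconds, cur_epoch, cur_iter, epoch_iters, max_epoch):
    total_iters = max_epoch * epoch_iters
    iters_done = cur_epoch * epoch_iters + cur_iter + 1
    return str(datetime.timedelta(seconds=int(iter_seconds * (total_iters - iters_done))))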
def setup_logger(save_dir):
    logger = logging.getLogger('slowfast')
    logger.setLevel(logging.DEBUG)
    logger.propagate = False

    # Don't log results for the non-master process.
    if not du.is_master_proc():
        _suppress_print()
        return None

    formatter = logging.Formatter(
        "%(asctime)s [%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s"
    )

    ch = logging.StreamHandler(stream=sys.stdout)
    ch.setLevel(logging.INFO)
    ch.setFormatter(formatter)
    logger.addHandler(ch)

    filename = os.path.join(save_dir, '{}.log'.format(datetime.now()))
    fh = logging.FileHandler(filename)
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    return logger
def test(cfg, cnt=-1):
    """
    Perform multi-view testing on the pretrained video model.
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Setup logging format.
    logging.setup_logging()

    # Perform multi-view test on the entire dataset.
    scores_path = os.path.join(cfg.OUTPUT_DIR, 'scores')
    if not os.path.exists(scores_path):
        os.makedirs(scores_path)
    filename_root = cfg.EPICKITCHENS.TEST_LIST.split('.')[0]
    if cnt >= 0:
        file_name = '{}_{}_{}.pkl'.format(filename_root, cnt, cfg.MODEL.MODEL_NAME)
    else:
        file_name = '{}_{}_{}.pkl'.format(filename_root, 'test_only', cfg.MODEL.MODEL_NAME)
    file_path = os.path.join(scores_path, file_name)
    logger.info(file_path)

    # Print config.
    # if cnt < 0:
    #     logger.info("Test with config:")
    #     logger.info(cfg)

    # Build the video model and print model statistics.
    model = build_model(cfg)
    if cfg.EPICKITCHENS.USE_BBOX:
        model.module.load_weight_slowfast()
    # if du.is_master_proc():
    #     misc.log_model_info(model, cfg, is_train=False)

    # Load a checkpoint to test if applicable.
    if cfg.TEST.CHECKPOINT_FILE_PATH != "":
        logger.info("Load from given checkpoint file.")
        cu.load_checkpoint(
            cfg.TEST.CHECKPOINT_FILE_PATH,
            model,
            cfg.NUM_GPUS > 1,
            None,
            inflation=False,
            convert_from_caffe2=cfg.TEST.CHECKPOINT_TYPE == "caffe2",
        )
    elif cu.has_checkpoint(cfg.OUTPUT_DIR):
        logger.info("Load from last checkpoint.")
        last_checkpoint = cu.get_last_checkpoint(cfg.OUTPUT_DIR)
        cu.load_checkpoint(last_checkpoint, model, cfg.NUM_GPUS > 1)
    elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "":
        # If no checkpoint is found in TEST.CHECKPOINT_FILE_PATH or in the
        # current checkpoint folder, try to load checkpoint from
        # TRAIN.CHECKPOINT_FILE_PATH and test it.
        cu.load_checkpoint(
            cfg.TRAIN.CHECKPOINT_FILE_PATH,
            model,
            cfg.NUM_GPUS > 1,
            None,
            inflation=False,
            convert_from_caffe2=cfg.TRAIN.CHECKPOINT_TYPE == "caffe2",
        )
    else:
        # raise NotImplementedError("Unknown way to load checkpoint.")
        logger.info("Testing with random initialization. Only for debugging.")

    # Create video testing loaders.
    if cfg.TEST.EXTRACT_FEATURES_MODE != "" and cfg.TEST.EXTRACT_FEATURES_MODE in ["test", "train", "val"]:
        test_loader = loader.construct_loader(cfg, cfg.TEST.EXTRACT_FEATURES_MODE)
    else:
        test_loader = loader.construct_loader(cfg, "test")
    logger.info("Testing model for {} iterations".format(len(test_loader)))

    if cfg.DETECTION.ENABLE:
        assert cfg.NUM_GPUS == cfg.TEST.BATCH_SIZE
        test_meter = AVAMeter(len(test_loader), cfg, mode="test")
    else:
        assert (
            len(test_loader.dataset)
            % (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS)
            == 0
        )
        # Create meters for multi-view testing.
        if cfg.TEST.DATASET == 'epickitchens':
            test_meter = EPICTestMeter(
                len(test_loader.dataset)
                // (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS),
                cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS,
                cfg.MODEL.NUM_CLASSES,
                len(test_loader),
            )
        else:
            test_meter = TestMeter(
                len(test_loader.dataset)
                // (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS),
                cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS,
                cfg.MODEL.NUM_CLASSES,
                len(test_loader),
            )

    pickle.dump([], open(file_path, 'wb+'))
    if cfg.TEST.EXTRACT_FEATURES:
        preds, labels, metadata, x_feat_list = perform_test(test_loader, model, test_meter, cfg)
    else:
        preds, labels, metadata = perform_test(test_loader, model, test_meter, cfg)

    if du.is_master_proc():
        if cfg.TEST.DATASET == 'epickitchens':
            results = {
                'verb_output': preds[0],
                'noun_output': preds[1],
                'verb_gt': labels[0],
                'noun_gt': labels[1],
                'narration_id': metadata,
            }
            scores_path = os.path.join(cfg.OUTPUT_DIR, 'scores')
            if not os.path.exists(scores_path):
                os.makedirs(scores_path)
            pickle.dump(results, open(file_path, 'wb'))
        if cfg.TEST.EXTRACT_FEATURES:
            pid = cfg.EPICKITCHENS.FEATURE_VID.split("_")[0]
            if not os.path.exists(os.path.join(cfg.TEST.EXTRACT_FEATURES_PATH, pid)):
                os.mkdir(os.path.join(cfg.TEST.EXTRACT_FEATURES_PATH, pid))
            if not cfg.TEST.EXTRACT_MSTCN_FEATURES and cfg.TEST.EXTRACT_FEATURES:
                arr_slow = torch.cat(x_feat_list[0], dim=0).numpy()
                arr_fast = torch.cat(x_feat_list[1], dim=0).numpy()
                print(arr_slow.shape, arr_fast.shape)
                fpath_feat = os.path.join(
                    cfg.TEST.EXTRACT_FEATURES_PATH, pid,
                    '{}.pkl'.format(cfg.EPICKITCHENS.FEATURE_VID),
                )
                with open(fpath_feat, 'wb+') as f:
                    pickle.dump([arr_slow, arr_fast], f)
            elif cfg.TEST.EXTRACT_MSTCN_FEATURES and cfg.TEST.EXTRACT_FEATURES:
                fpath_feat = os.path.join(
                    cfg.TEST.EXTRACT_FEATURES_PATH, pid,
                    '{}.npy'.format(cfg.EPICKITCHENS.FEATURE_VID),
                )
                with open(fpath_feat, 'wb+') as f:
                    arr = torch.cat(x_feat_list, dim=0).numpy()
                    print(arr.shape)
                    np.save(f, arr)
def visualize(cfg):
    """
    Perform layer weights and activations visualization on the model.
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    if cfg.TENSORBOARD.ENABLE and (
        cfg.TENSORBOARD.MODEL_VIS.ENABLE or cfg.TENSORBOARD.WRONG_PRED_VIS.ENABLE
    ):
        # Set up environment.
        du.init_distributed_training(cfg)
        # Set random seed from configs.
        np.random.seed(cfg.RNG_SEED)
        torch.manual_seed(cfg.RNG_SEED)

        # Setup logging format.
        logging.setup_logging(cfg.OUTPUT_DIR)

        # Print config.
        logger.info("Model Visualization with config:")
        logger.info(cfg)

        # Build the video model and print model statistics.
        model = build_model(cfg)
        model.eval()
        if du.is_master_proc() and cfg.LOG_MODEL_INFO:
            misc.log_model_info(model, cfg, use_train_input=False)

        cu.load_test_checkpoint(cfg, model)

        # Create video testing loaders.
        vis_loader = loader.construct_loader(cfg, "test")

        if cfg.DETECTION.ENABLE:
            assert cfg.NUM_GPUS == cfg.TEST.BATCH_SIZE or cfg.NUM_GPUS == 0

        # Set up writer for logging to Tensorboard format.
        if du.is_master_proc(cfg.NUM_GPUS * cfg.NUM_SHARDS):
            writer = tb.TensorboardWriter(cfg)
        else:
            writer = None

        if cfg.TENSORBOARD.PREDICTIONS_PATH != "":
            assert not cfg.DETECTION.ENABLE, "Detection is not supported."
            logger.info(
                "Visualizing class-level performance from saved results..."
            )
            if writer is not None:
                with g_pathmgr.open(cfg.TENSORBOARD.PREDICTIONS_PATH, "rb") as f:
                    preds, labels = pickle.load(f, encoding="latin1")
                writer.plot_eval(preds, labels)

        if cfg.TENSORBOARD.MODEL_VIS.ENABLE:
            if cfg.TENSORBOARD.MODEL_VIS.GRAD_CAM.ENABLE:
                assert (
                    not cfg.DETECTION.ENABLE
                ), "Detection task is currently not supported for Grad-CAM visualization."
                if cfg.MODEL.ARCH in cfg.MODEL.SINGLE_PATHWAY_ARCH:
                    assert (
                        len(cfg.TENSORBOARD.MODEL_VIS.GRAD_CAM.LAYER_LIST) == 1
                    ), "The number of chosen CNN layers must be equal to the number of pathway(s), given {} layer(s).".format(
                        len(cfg.TENSORBOARD.MODEL_VIS.GRAD_CAM.LAYER_LIST)
                    )
                elif cfg.MODEL.ARCH in cfg.MODEL.MULTI_PATHWAY_ARCH:
                    assert (
                        len(cfg.TENSORBOARD.MODEL_VIS.GRAD_CAM.LAYER_LIST) == 2
                    ), "The number of chosen CNN layers must be equal to the number of pathway(s), given {} layer(s).".format(
                        len(cfg.TENSORBOARD.MODEL_VIS.GRAD_CAM.LAYER_LIST)
                    )
                else:
                    raise NotImplementedError(
                        "Model arch {} is not in {}".format(
                            cfg.MODEL.ARCH,
                            cfg.MODEL.SINGLE_PATHWAY_ARCH
                            + cfg.MODEL.MULTI_PATHWAY_ARCH,
                        )
                    )
            logger.info(
                "Visualize model analysis for {} iterations".format(len(vis_loader))
            )
            # Run visualization on the model.
            run_visualization(vis_loader, model, cfg, writer)

        if cfg.TENSORBOARD.WRONG_PRED_VIS.ENABLE:
            logger.info(
                "Visualize Wrong Predictions for {} iterations".format(len(vis_loader))
            )
            perform_wrong_prediction_vis(vis_loader, model, cfg)

        if writer is not None:
            writer.close()
def train(cfg):
    """
    Train a video model for many epochs on train set and evaluate it on val set.
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Setup logging format.
    logging.setup_logging()

    if du.is_master_proc():
        if cfg.ENABLE_WANDB:
            wandb.login()
            wandb.init(project='bbox', entity='slowfast')

    # Print config.
    # logger.info("Train with config:")
    # logger.info(pprint.pformat(cfg))

    # Build the video model and print model statistics.
    model = build_model(cfg)
    if cfg.EPICKITCHENS.USE_BBOX and not cu.has_checkpoint(cfg.OUTPUT_DIR):
        slow_fast_model = SlowFast(cfg)
        if cfg.EPICKITCHENS.USE_BBOX and cfg.EPICKITCHENS.LOAD_SLOWFAST_PRETRAIN:
            if cfg.EPICKITCHENS.SLOWFAST_PRETRAIN_CHECKPOINT_FILE_PATH != "":
                _ = cu.load_checkpoint(
                    cfg.EPICKITCHENS.SLOWFAST_PRETRAIN_CHECKPOINT_FILE_PATH,
                    slow_fast_model,
                    False,
                    optimizer=None,
                    inflation=cfg.TRAIN.CHECKPOINT_INFLATE,
                    convert_from_caffe2=False,  # cfg.TRAIN.CHECKPOINT_TYPE == "caffe2"
                )
                logger.info("Load from slowfast.")
        if cfg.NUM_GPUS > 1:
            model.module.load_weight_slowfast(slow_fast_model)
        else:
            model.load_weight_slowfast(slow_fast_model)

    # if du.is_master_proc():
    #     misc.log_model_info(model, cfg, is_train=True)

    if cfg.BN.FREEZE:
        model.freeze_fn('bn_parameters')
    if cfg.EPICKITCHENS.USE_BBOX and cfg.EPICKITCHENS.FREEZE_BACKBONE:
        model.freeze_fn('slowfast_bbox')

    # Construct the optimizer.
    optimizer = optim.construct_optimizer(model, cfg)

    # Load a checkpoint to resume training if applicable.
    if cfg.TRAIN.AUTO_RESUME and cu.has_checkpoint(cfg.OUTPUT_DIR):
        logger.info("Load from last checkpoint.")
        last_checkpoint = cu.get_last_checkpoint(cfg.OUTPUT_DIR)
        checkpoint_epoch = cu.load_checkpoint(
            last_checkpoint, model, cfg.NUM_GPUS > 1, optimizer
        )
        start_epoch = checkpoint_epoch + 1
    elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "" and not cfg.TRAIN.FINETUNE:
        logger.info("Load from given checkpoint file.")
        checkpoint_epoch = cu.load_checkpoint(
            cfg.TRAIN.CHECKPOINT_FILE_PATH,
            model,
            cfg.NUM_GPUS > 1,
            optimizer,
            inflation=cfg.TRAIN.CHECKPOINT_INFLATE,
            convert_from_caffe2=cfg.TRAIN.CHECKPOINT_TYPE == "caffe2",
        )
        start_epoch = checkpoint_epoch + 1
    elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "" and cfg.TRAIN.FINETUNE:
        logger.info("Load from given checkpoint file. Finetuning.")
        _ = cu.load_checkpoint(
            cfg.TRAIN.CHECKPOINT_FILE_PATH,
            model,
            cfg.NUM_GPUS > 1,
            None,
            inflation=cfg.TRAIN.CHECKPOINT_INFLATE,
            convert_from_caffe2=cfg.TRAIN.CHECKPOINT_TYPE == "caffe2",
        )
        start_epoch = 0
    else:
        start_epoch = 0

    # Create the video train and val loaders.
    if cfg.TRAIN.DATASET != 'epickitchens' or not cfg.EPICKITCHENS.TRAIN_PLUS_VAL:
        train_loader = loader.construct_loader(cfg, "train")
        val_loader = loader.construct_loader(cfg, "val")
        logger.info("Train loader size: {}".format(len(train_loader)))
        logger.info("Val loader size: {}".format(len(val_loader)))
    else:
        train_loader = loader.construct_loader(cfg, "train+val")
        val_loader = loader.construct_loader(cfg, "val")

    # Create meters.
    if cfg.DETECTION.ENABLE:
        train_meter = AVAMeter(len(train_loader), cfg, mode="train")
        val_meter = AVAMeter(len(val_loader), cfg, mode="val")
    else:
        if cfg.TRAIN.DATASET == 'epickitchens':
            train_meter = EPICTrainMeterSimple(len(train_loader), cfg)
            val_meter = EPICValMeterSimple(len(val_loader), cfg)
        else:
            train_meter = TrainMeter(len(train_loader), cfg)
            val_meter = ValMeter(len(val_loader), cfg)

    # Perform the training loop.
    logger.info("Start epoch: {}".format(start_epoch + 1))
    cnt = 0
    # eval_epoch(val_loader, model, val_meter, 0, cfg, cnt)
    # test_from_train(model, cfg, cnt=cnt)
    for cur_epoch in range(start_epoch, cfg.SOLVER.MAX_EPOCH):
        # Shuffle the dataset.
        loader.shuffle_dataset(train_loader, cur_epoch)
        # Train for one epoch.
        cnt = train_epoch(
            train_loader, model, optimizer, train_meter, cur_epoch, cfg, cnt
        )

        # Compute precise BN stats.
        if cfg.BN.USE_PRECISE_STATS and len(get_bn_modules(model)) > 0:
            calculate_and_update_precise_bn(
                train_loader, model, cfg.BN.NUM_BATCHES_PRECISE
            )

        # Save a checkpoint.
        if cu.is_checkpoint_epoch(cur_epoch, cfg.TRAIN.CHECKPOINT_PERIOD):
            cu.save_checkpoint(cfg.OUTPUT_DIR, model, optimizer, cur_epoch, cfg)
        # Evaluate the model on validation set.
        if misc.is_eval_epoch(cfg, cur_epoch):
            is_best_epoch = eval_epoch(
                val_loader, model, val_meter, cur_epoch, cfg, cnt
            )
            if is_best_epoch:
                cu.save_checkpoint(
                    cfg.OUTPUT_DIR, model, optimizer, cur_epoch, cfg,
                    is_best_epoch=is_best_epoch,
                )
def train(cfg):
    """
    Train a video model for many epochs on train set and evaluate it on val set.
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Setup logging format.
    logging.setup_logging()

    # Print config.
    logger.info("Train with config:")
    logger.info(pprint.pformat(cfg))

    # Build the video model and print model statistics.
    model = model_builder.build_model(cfg)
    if du.is_master_proc():
        misc.log_model_info(model)

    # Construct the optimizer.
    optimizer = optim.construct_optimizer(model, cfg)

    # Load a checkpoint to resume training if applicable.
    if cfg.TRAIN.AUTO_RESUME and cu.has_checkpoint(cfg.OUTPUT_DIR):
        logger.info("Load from last checkpoint.")
        last_checkpoint = cu.get_last_checkpoint(cfg.OUTPUT_DIR)
        checkpoint_epoch = cu.load_checkpoint(
            last_checkpoint, model, cfg.NUM_GPUS > 1, optimizer
        )
        start_epoch = checkpoint_epoch + 1
    elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "":
        logger.info("Load from given checkpoint file.")
        checkpoint_epoch = cu.load_checkpoint(
            cfg.TRAIN.CHECKPOINT_FILE_PATH,
            model,
            cfg.NUM_GPUS > 1,
            optimizer,
            inflation=cfg.TRAIN.CHECKPOINT_INFLATE,
            convert_from_caffe2=cfg.TRAIN.CHECKPOINT_TYPE == "caffe2",
        )
        start_epoch = checkpoint_epoch + 1
    else:
        start_epoch = 0

    # Create the video train and val loaders.
    train_loader = loader.construct_loader(cfg, "train")
    val_loader = loader.construct_loader(cfg, "val")

    # Create meters.
    if cfg.DETECTION.ENABLE:
        train_meter = AVAMeter(len(train_loader), cfg, mode="train")
        val_meter = AVAMeter(len(val_loader), cfg, mode="val")
    else:
        train_meter = TrainMeter(len(train_loader), cfg)
        val_meter = ValMeter(len(val_loader), cfg)

    # Perform the training loop.
    logger.info("Start epoch: {}".format(start_epoch + 1))

    for cur_epoch in range(start_epoch, cfg.SOLVER.MAX_EPOCH):
        # Shuffle the dataset.
        loader.shuffle_dataset(train_loader, cur_epoch)
        # Train for one epoch.
        train_epoch(train_loader, model, optimizer, train_meter, cur_epoch, cfg)

        # Compute precise BN stats.
        if cfg.BN.USE_PRECISE_STATS and len(get_bn_modules(model)) > 0:
            calculate_and_update_precise_bn(
                train_loader, model, cfg.BN.NUM_BATCHES_PRECISE
            )

        # Save a checkpoint.
        if cu.is_checkpoint_epoch(cur_epoch, cfg.TRAIN.CHECKPOINT_PERIOD):
            cu.save_checkpoint(cfg.OUTPUT_DIR, model, optimizer, cur_epoch, cfg)
        # Evaluate the model on validation set.
        if misc.is_eval_epoch(cfg, cur_epoch):
            eval_epoch(val_loader, model, val_meter, cur_epoch, cfg)
def train(cfg):
    """
    Train a video model for many epochs on train set and evaluate it on val set.
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Setup logging format.
    logging.setup_logging(cfg)

    # Print config.
    logger.info("Train with config:")
    logger.info(pprint.pformat(cfg))

    if du.get_rank() == 0 and du.is_master_proc(num_gpus=cfg.NUM_GPUS):
        writer = SummaryWriter(log_dir=cfg.OUTPUT_DIR)
    else:
        writer = None

    if du.get_rank() == 0 and du.is_master_proc(num_gpus=cfg.NUM_GPUS) and not cfg.DEBUG:
        tags = []
        if 'TAGS' in cfg and cfg.TAGS != []:
            tags = list(cfg.TAGS)
        neptune.set_project('Serre-Lab/motion')

        ######################
        overrides = sys.argv[1:]
        overrides_dict = {}
        for i in range(len(overrides) // 2):
            overrides_dict[overrides[2 * i]] = overrides[2 * i + 1]
        overrides_dict['dir'] = cfg.OUTPUT_DIR
        ######################

        if 'NEP_ID' in cfg and cfg.NEP_ID != "":
            session = Session()
            project = session.get_project(project_qualified_name='Serre-Lab/motion')
            nep_experiment = project.get_experiments(id=cfg.NEP_ID)[0]
        else:
            nep_experiment = neptune.create_experiment(
                name=cfg.NAME, params=overrides_dict, tags=tags
            )
    else:
        nep_experiment = None

    # Build the video model and print model statistics.
    model = build_model(cfg)
    if du.is_master_proc(num_gpus=cfg.NUM_GPUS):
        misc.log_model_info(model, cfg, is_train=True)

    # Construct the optimizer.
    optimizer = optim.construct_optimizer(model, cfg)

    # Load a checkpoint to resume training if applicable.
    if cfg.TRAIN.AUTO_RESUME and cu.has_checkpoint(cfg.OUTPUT_DIR):
        logger.info("Load from last checkpoint.")
        last_checkpoint = cu.get_last_checkpoint(cfg.OUTPUT_DIR)
        checkpoint_epoch = cu.load_checkpoint(
            last_checkpoint, model, cfg.NUM_GPUS > 1, optimizer
        )
        start_epoch = checkpoint_epoch + 1
    elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "":
        logger.info("Load from given checkpoint file.")
        checkpoint_epoch = cu.load_checkpoint(
            cfg.TRAIN.CHECKPOINT_FILE_PATH,
            model,
            cfg.NUM_GPUS > 1,
            optimizer,
            inflation=cfg.TRAIN.CHECKPOINT_INFLATE,
            convert_from_caffe2=cfg.TRAIN.CHECKPOINT_TYPE == "caffe2",
        )
        start_epoch = checkpoint_epoch + 1
    else:
        start_epoch = 0

    # Create the video train and val loaders.
    train_loader = loader.construct_loader(cfg, "train")
    val_loader = loader.construct_loader(cfg, "val")

    # Create meters.
    if cfg.DETECTION.ENABLE:
        train_meter = AVAMeter(len(train_loader), cfg, mode="train")
        val_meter = AVAMeter(len(val_loader), cfg, mode="val")
    else:
        train_meter = TrainMeter(len(train_loader), cfg)
        val_meter = ValMeter(len(val_loader), cfg)

    # Perform the training loop.
    logger.info("Start epoch: {}".format(start_epoch + 1))

    for cur_epoch in range(start_epoch, cfg.SOLVER.MAX_EPOCH):
        # Shuffle the dataset.
        loader.shuffle_dataset(train_loader, cur_epoch)
        # Train for one epoch.
        train_epoch(
            train_loader, model, optimizer, train_meter, cur_epoch,
            writer, nep_experiment, cfg,
        )

        # Compute precise BN stats.
        # if cfg.BN.USE_PRECISE_STATS and len(get_bn_modules(model)) > 0:
        #     calculate_and_update_precise_bn(
        #         train_loader, model, cfg.BN.NUM_BATCHES_PRECISE
        #     )

        # Save a checkpoint.
        if cu.is_checkpoint_epoch(cur_epoch, cfg.TRAIN.CHECKPOINT_PERIOD):
            cu.save_checkpoint(cfg.OUTPUT_DIR, model, optimizer, cur_epoch, cfg)
        # Evaluate the model on validation set.
        if misc.is_eval_epoch(cfg, cur_epoch):
            eval_epoch(val_loader, model, val_meter, cur_epoch, nep_experiment, cfg)

        if du.get_rank() == 0 and du.is_master_proc(num_gpus=cfg.NUM_GPUS) and not cfg.DEBUG:
            nep_experiment.log_metric('epoch', cur_epoch)
def train_epoch(
    train_loader, model, optimizer, train_meter, cur_epoch, writer, nep, cfg
):
    """
    Perform the video training for one epoch.
    Args:
        train_loader (loader): video training loader.
        model (model): the video model to train.
        optimizer (optim): the optimizer to perform optimization on the model's
            parameters.
        train_meter (TrainMeter): training meters to log the training performance.
        cur_epoch (int): current epoch of training.
        writer (SummaryWriter): TensorBoard writer (None on non-master processes).
        nep (neptune experiment): Neptune experiment for metric logging, or None.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Enable train mode.
    model.train()
    train_meter.iter_tic()
    data_size = len(train_loader)
    global_iters = data_size * cur_epoch

    for cur_iter, (inputs, labels, _, meta) in enumerate(train_loader):
        # Transfer the data to the current GPU device.
        inputs = inputs.cuda(non_blocking=True)
        # labels = torch.repeat_interleave(labels, inputs[i].size(1), 0)
        # for i in range(len(inputs)):
        #     if len(inputs[i].shape) > 5:
        #         inputs[i] = inputs[i].view((-1,) + inputs[i].shape[2:])
        # labels = labels.cuda()
        # for key, val in meta.items():
        #     if isinstance(val, (list,)):
        #         for i in range(len(val)):
        #             val[i] = val[i].cuda(non_blocking=True)
        #     else:
        #         meta[key] = val.cuda(non_blocking=True)

        # Update the learning rate.
        lr = optim.get_epoch_lr(
            cur_epoch + float(cur_iter) / data_size, global_iters, cfg
        )
        optim.set_lr(optimizer, lr)

        preds = model(inputs)
        out_keys = list(preds.keys())
        total_loss = preds['total_loss']

        # check Nan Loss.
        misc.check_nan_losses(total_loss)

        # Perform the backward pass.
        optimizer.zero_grad()
        total_loss.backward()

        ####################################################################
        # check gradients
        # if writer is not None and global_iters % cfg.SUMMARY_PERIOD == 0:
        #     n_p = model.module.named_parameters() if hasattr(model, 'module') else model.named_parameters()
        #     fig = viz_helpers.plot_grad_flow_v2(n_p)
        #     writer.add_figure('grad_flow/grad_flow', fig, global_iters)
        ####################################################################

        # Update the parameters.
        optimizer.step()

        losses = [preds[k] for k in out_keys]
        # Gather all the predictions across all the devices.
        if cfg.NUM_GPUS > 1:
            losses = du.all_reduce(losses)
        losses = [l.item() for l in losses]

        loss_logs = {}
        for i in range(len(losses)):
            loss_logs[out_keys[i]] = losses[i]

        train_meter.iter_toc()
        # Update and log stats.
        train_meter.update_stats(
            lr, inputs[0].size(0) * cfg.NUM_GPUS, **loss_logs
        )

        if writer is not None and global_iters % cfg.LOG_PERIOD == 0:
            logger.info(model.conv0[2].weight)
            logger.info(model.conv0[2].bias)
            for k, v in loss_logs.items():
                # str.strip('loss_') strips a character set, not the prefix;
                # remove the literal "loss_" prefix instead.
                tag = k[len('loss_'):] if k.startswith('loss_') else k
                writer.add_scalar(
                    'loss/' + tag, train_meter.stats[k].get_win_median(), global_iters
                )
        if nep is not None and global_iters % cfg.LOG_PERIOD == 0:
            for k, v in loss_logs.items():
                tag = k[len('loss_'):] if k.startswith('loss_') else k
                nep.log_metric(tag, train_meter.stats[k].get_win_median())
            nep.log_metric('global_iters', global_iters)

        if global_iters % cfg.SUMMARY_PERIOD == 0 and du.get_rank() == 0 and du.is_master_proc(num_gpus=cfg.NUM_GPUS):
            with torch.no_grad():
                # logger.info(inputs[i].shape)
                # sys.stdout.flush()
                inputs = inputs[:min(3, len(inputs))]
                if 'masks' in meta:
                    frames = model(
                        (inputs, meta['masks'][:min(3, len(inputs))]),
                        extra=['frames'],
                    )['frames']
                else:
                    frames = model(inputs, extra=['frames'])['frames']

                n_rows = inputs.size(2) - 1
                inputs = inputs.transpose(1, 2)[:, -n_rows:]
                frames = frames.transpose(1, 2)[:, -n_rows:]
                frames = torch.cat(
                    [
                        (frames != inputs) * frames,
                        (frames == inputs) * inputs,
                        torch.zeros_like(frames),
                    ],
                    2,
                )
                inputs = torch.cat([inputs] * 3, 2)
                # inputs = inputs * inputs.new(cfg.DATA.STD)[None, None, :, None, None] + inputs.new(cfg.DATA.MEAN)[None, None, :, None, None]
                # frames = frames * frames.new(cfg.DATA.STD)[None, None, :, None, None] + frames.new(cfg.DATA.MEAN)[None, None, :, None, None]
                images = torch.cat([inputs, frames], 1).reshape((-1,) + inputs.shape[2:])
                # grid = tv.utils.make_grid(images, nrow=8, normalize=True)
                # writer.add_image('predictions', images, global_iters)
                tv.utils.save_image(
                    images,
                    os.path.join(cfg.OUTPUT_DIR, 'preds_%d.jpg' % global_iters),
                    nrow=n_rows,
                    normalize=True,
                )
                # del images
                # del frames
                # del inputs

        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()
        global_iters += 1

    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()
def test(cfg):
    """
    Perform multi-view testing on the pretrained video model.
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Setup logging format.
    logging.setup_logging()

    # Print config.
    logger.info("Test with config:")
    logger.info(cfg)

    # Build the video model and print model statistics.
    model = model_builder.build_model(cfg)
    if du.is_master_proc():
        misc.log_model_info(model)

    # Load a checkpoint to test if applicable.
    if cfg.TEST.CHECKPOINT_FILE_PATH != "":
        cu.load_checkpoint(
            cfg.TEST.CHECKPOINT_FILE_PATH,
            model,
            cfg.NUM_GPUS > 1,
            None,
            inflation=False,
            convert_from_caffe2=cfg.TEST.CHECKPOINT_TYPE == "caffe2",
        )
    elif cu.has_checkpoint(cfg.OUTPUT_DIR):
        last_checkpoint = cu.get_last_checkpoint(cfg.OUTPUT_DIR)
        cu.load_checkpoint(last_checkpoint, model, cfg.NUM_GPUS > 1)
    elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "":
        # If no checkpoint is found in TEST.CHECKPOINT_FILE_PATH or in the
        # current checkpoint folder, try to load checkpoint from
        # TRAIN.CHECKPOINT_FILE_PATH and test it.
        cu.load_checkpoint(
            cfg.TRAIN.CHECKPOINT_FILE_PATH,
            model,
            cfg.NUM_GPUS > 1,
            None,
            inflation=False,
            convert_from_caffe2=cfg.TRAIN.CHECKPOINT_TYPE == "caffe2",
        )
    else:
        # raise NotImplementedError("Unknown way to load checkpoint.")
        logger.info("Testing with random initialization. Only for debugging.")

    # Create video testing loaders.
    test_loader = loader.construct_loader(cfg, "test")
    logger.info("Testing model for {} iterations".format(len(test_loader)))

    if cfg.DETECTION.ENABLE:
        assert cfg.NUM_GPUS == cfg.TEST.BATCH_SIZE
        test_meter = AVAMeter(len(test_loader), cfg, mode="test")
    else:
        assert (
            len(test_loader.dataset)
            % (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS)
            == 0
        )
        # Create meters for multi-view testing.
        test_meter = TestMeter(
            len(test_loader.dataset)
            // (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS),
            cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS,
            cfg.MODEL.NUM_CLASSES,
            len(test_loader),
        )

    # Perform multi-view test on the entire dataset.
    perform_test(test_loader, model, test_meter, cfg)
def test(cfg):
    """
    Perform multi-view testing on the pretrained video model.
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Set up environment.
    du.init_distributed_training(cfg)
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Setup logging format.
    logging.setup_logging(cfg.OUTPUT_DIR)

    # Print config.
    logger.info("Test with config:")
    logger.info(cfg)

    # Build the video model and print model statistics.
    model = build_model(cfg)
    if du.is_master_proc() and cfg.LOG_MODEL_INFO:
        misc.log_model_info(model, cfg, is_train=False)

    # Load a checkpoint to test if applicable.
    if cfg.TEST.CHECKPOINT_FILE_PATH != "":
        cu.load_checkpoint(
            cfg.TEST.CHECKPOINT_FILE_PATH,
            model,
            cfg.NUM_GPUS > 1,
            None,
            inflation=False,
            convert_from_caffe2=cfg.TEST.CHECKPOINT_TYPE == "caffe2",
        )
    elif cu.has_checkpoint(cfg.OUTPUT_DIR):
        last_checkpoint = cu.get_last_checkpoint(cfg.OUTPUT_DIR)
        cu.load_checkpoint(last_checkpoint, model, cfg.NUM_GPUS > 1)
    elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "":
        # If no checkpoint found in TEST.CHECKPOINT_FILE_PATH or in the current
        # checkpoint folder, try to load checkpoint from
        # TRAIN.CHECKPOINT_FILE_PATH and test it.
        cu.load_checkpoint(
            cfg.TRAIN.CHECKPOINT_FILE_PATH,
            model,
            cfg.NUM_GPUS > 1,
            None,
            inflation=False,
            convert_from_caffe2=cfg.TRAIN.CHECKPOINT_TYPE == "caffe2",
        )
    else:
        # raise NotImplementedError("Unknown way to load checkpoint.")
        logger.info("Testing with random initialization. Only for debugging.")

    # Create video testing loaders.
    test_loader = loader.construct_loader(cfg, "test")
    logger.info("Testing model for {} iterations".format(len(test_loader)))

    assert (
        len(test_loader.dataset)
        % (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS)
        == 0
    )
    # Create meters for multi-view testing.
    test_meter = TestMeter(
        len(test_loader.dataset)
        // (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS),
        cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS,
        cfg.MODEL.NUM_CLASSES,
        len(test_loader),
        cfg.DATA.MULTI_LABEL,
        cfg.DATA.ENSEMBLE_METHOD,
    )

    # Set up writer for logging to Tensorboard format.
    if cfg.TENSORBOARD.ENABLE and du.is_master_proc(
        cfg.NUM_GPUS * cfg.NUM_SHARDS
    ):
        writer = tb.TensorboardWriter(cfg)
    else:
        writer = None

    # Perform multi-view test on the entire dataset.
    perform_test(test_loader, model, test_meter, cfg, writer)

    if writer is not None:
        writer.close()
def train(cfg):
    """
    Train a video model for many epochs on train set and evaluate it on val set.
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Set up environment.
    du.init_distributed_training(cfg)
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Setup logging format.
    logging.setup_logging(cfg.OUTPUT_DIR)

    # Init multigrid.
    multigrid = None
    if cfg.MULTIGRID.LONG_CYCLE or cfg.MULTIGRID.SHORT_CYCLE:
        multigrid = MultigridSchedule()
        cfg = multigrid.init_multigrid(cfg)
        if cfg.MULTIGRID.LONG_CYCLE:
            cfg, _ = multigrid.update_long_cycle(cfg, cur_epoch=0)

    # Print config.
    logger.info("Train with config:")
    logger.info(pprint.pformat(cfg))

    # Build the video model and print model statistics.
    model = build_model(cfg)
    if du.is_master_proc() and cfg.LOG_MODEL_INFO:
        misc.log_model_info(model, cfg, use_train_input=True)

    # Construct the optimizer.
    optimizer = optim.construct_optimizer(model, cfg)

    # Load a checkpoint to resume training if applicable.
    start_epoch = cu.load_train_checkpoint(cfg, model, optimizer)

    # Create the video train and val loaders.
    train_loader = loader.construct_loader(cfg, "train")
    val_loader = loader.construct_loader(cfg, "val")
    precise_bn_loader = (
        loader.construct_loader(cfg, "train", is_precise_bn=True)
        if cfg.BN.USE_PRECISE_STATS
        else None
    )

    # Create meters.
    if cfg.DETECTION.ENABLE:
        train_meter = AVAMeter(len(train_loader), cfg, mode="train")
        val_meter = AVAMeter(len(val_loader), cfg, mode="val")
    else:
        train_meter = TrainMeter(len(train_loader), cfg)
        val_meter = ValMeter(len(val_loader), cfg)

    # Set up writer for logging to Tensorboard format.
    if cfg.TENSORBOARD.ENABLE and du.is_master_proc(
        cfg.NUM_GPUS * cfg.NUM_SHARDS
    ):
        writer = tb.TensorboardWriter(cfg)
    else:
        writer = None

    # Perform the training loop.
    logger.info("Start epoch: {}".format(start_epoch + 1))

    for cur_epoch in range(start_epoch, cfg.SOLVER.MAX_EPOCH):
        if cfg.MULTIGRID.LONG_CYCLE:
            cfg, changed = multigrid.update_long_cycle(cfg, cur_epoch)
            if changed:
                (
                    model,
                    optimizer,
                    train_loader,
                    val_loader,
                    precise_bn_loader,
                    train_meter,
                    val_meter,
                ) = build_trainer(cfg)

                # Load checkpoint.
                if cu.has_checkpoint(cfg.OUTPUT_DIR):
                    last_checkpoint = cu.get_last_checkpoint(cfg.OUTPUT_DIR)
                    assert "{:05d}.pyth".format(cur_epoch) in last_checkpoint
                else:
                    last_checkpoint = cfg.TRAIN.CHECKPOINT_FILE_PATH
                logger.info("Load from {}".format(last_checkpoint))
                cu.load_checkpoint(last_checkpoint, model, cfg.NUM_GPUS > 1, optimizer)

        # Shuffle the dataset.
        loader.shuffle_dataset(train_loader, cur_epoch)
        # Train for one epoch.
        train_epoch(train_loader, model, optimizer, train_meter, cur_epoch, cfg, writer)

        is_checkp_epoch = cu.is_checkpoint_epoch(
            cfg,
            cur_epoch,
            None if multigrid is None else multigrid.schedule,
        )
        is_eval_epoch = misc.is_eval_epoch(
            cfg, cur_epoch, None if multigrid is None else multigrid.schedule
        )

        # Compute precise BN stats.
        if (
            (is_checkp_epoch or is_eval_epoch)
            and cfg.BN.USE_PRECISE_STATS
            and len(get_bn_modules(model)) > 0
        ):
            calculate_and_update_precise_bn(
                precise_bn_loader,
                model,
                min(cfg.BN.NUM_BATCHES_PRECISE, len(precise_bn_loader)),
                cfg.NUM_GPUS > 0,
            )
        _ = misc.aggregate_sub_bn_stats(model)

        # Save a checkpoint.
        if is_checkp_epoch:
            cu.save_checkpoint(cfg.OUTPUT_DIR, model, optimizer, cur_epoch, cfg)
        # Evaluate the model on validation set.
        if is_eval_epoch:
            eval_epoch(val_loader, model, val_meter, cur_epoch, cfg, writer)

    if writer is not None:
        writer.close()
def train_epoch(
    train_loader, model, optimizer, train_meter, cur_epoch, cfg, writer=None
):
    """
    Perform the video training for one epoch.
    Args:
        train_loader (loader): video training loader.
        model (model): the video model to train.
        optimizer (optim): the optimizer to perform optimization on the model's
            parameters.
        train_meter (TrainMeter): training meters to log the training performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter, optional): TensorboardWriter object to write
            Tensorboard log.
    """
    # Enable train mode.
    model.train()
    # Check that the correct params are set to requires_grad = True.
    assert_requires_grad_correctness(model, du.is_master_proc(), cfg)
    train_meter.iter_tic()
    data_size = len(train_loader)
    np.set_printoptions(suppress=True)

    for cur_iter, (inputs, labels, _, meta) in enumerate(train_loader):
        # Transfer the data to the current GPU device.
        if isinstance(inputs, (list,)):
            for i in range(len(inputs)):
                inputs[i] = inputs[i].cuda(non_blocking=True)
        else:
            inputs = inputs.cuda(non_blocking=True)
        labels = labels.cuda()
        for key, val in meta.items():
            if isinstance(val, (list,)):
                for i in range(len(val)):
                    val[i] = val[i].cuda(non_blocking=True)
            else:
                meta[key] = val.cuda(non_blocking=True)

        if cfg.MODEL.HEAD_ACT == "softmax" and cfg.TRAIN.DATASET == "custom":
            # We have to change our labels to long tensor.
            labels = labels.type(torch.LongTensor)
            labels = labels.cuda()

        # Update the learning rate.
        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size, cfg)
        optim.set_lr(optimizer, lr, cfg)

        if cfg.DETECTION.ENABLE:
            # Compute the predictions.
            preds = model(inputs, meta["boxes"], is_train=True)
        else:
            # Perform the forward pass.
            preds = model(inputs)
        # Explicitly declare reduction to mean.
        loss_fun = losses.get_loss_func(cfg.MODEL.LOSS_FUNC)(reduction="mean")

        # Compute the loss.
        loss = loss_fun(preds, labels)

        """
        if cur_iter % 70 == 0:
            softmax = torch.nn.Softmax(dim=1)
            probabilities = softmax(preds)
            loss_prob = loss_fun(probabilities, labels)
            preds_numpy = probabilities.cpu().detach().numpy()
            preds_numpy = np.round(preds_numpy, 4)
            labels_numpy = labels.cpu().detach().numpy()
            print("--------------------------")
            for label, pred in zip(labels_numpy, preds_numpy):
                print(str(label) + "---->", end="")
                print(pred[label])
        """

        # check Nan Loss.
        misc.check_nan_losses(loss)

        # Perform the backward pass.
        optimizer.zero_grad()
        loss.backward()
        # Update the parameters.
        optimizer.step()

        # Todo: adjust accordingly
        if cfg.DETECTION.ENABLE:  # and not (cfg.MODEL.HEAD_ACT == "softmax"):
            if cfg.NUM_GPUS > 1:
                loss = du.all_reduce([loss])[0]
            loss = loss.item()

            train_meter.iter_toc()
            # Update and log stats.
            train_meter.update_stats(None, None, None, loss, lr)
            # write to tensorboard format if available.
            if writer is not None:
                writer.add_scalars(
                    {"Train/loss": loss, "Train/lr": lr},
                    global_step=data_size * cur_epoch + cur_iter,
                )
        else:
            top1_err, top5_err = None, None
            if cfg.DATA.MULTI_LABEL:
                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    [loss] = du.all_reduce([loss])
                loss = loss.item()
            else:
                # Compute the errors.
                num_topks_correct = metrics.topks_correct(preds, labels, (1, 5))
                top1_err, top5_err = [
                    (1.0 - x / preds.size(0)) * 100.0 for x in num_topks_correct
                ]

                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    loss, top1_err, top5_err = du.all_reduce(
                        [loss, top1_err, top5_err]
                    )

                # Copy the stats from GPU to CPU (sync point).
                loss, top1_err, top5_err = (
                    loss.item(),
                    top1_err.item(),
                    top5_err.item(),
                )

            train_meter.iter_toc()
            # Update and log stats.
            train_meter.update_stats(
                top1_err, top5_err, loss, lr, inputs[0].size(0) * cfg.NUM_GPUS
            )
            # write to tensorboard format if available.
            if writer is not None:
                writer.add_scalars(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr,
                        "Train/Top1_err": top1_err,
                        "Train/Top5_err": top5_err,
                    },
                    global_step=data_size * cur_epoch + cur_iter,
                )

        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()

    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()
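# Sketch of the top-k correctness count used above. metrics.topks_correct is
# the codebase's helper; this standalone reimplementation shows the logic
# under the assumption of a (batch, num_classes) prediction tensor and
# integer class labels, and is illustrative rather than the library code.
import torch


def topks_correct_sketch(preds, labels, ks=(1, 5)):
    # Indices of the k highest-scoring classes per example, best first.
    _, top_idx = preds.topk(max(ks), dim=1, largest=True, sorted=True)
    # Compare each of the top-k columns against the ground-truth label.
    rep_labels = labels.view(-1, 1).expand_as(top_idx)
    correct = top_idx.eq(rep_labels)
    # Count examples whose label appears within the first k guesses.
    return [correct[:, :k].any(dim=1).float().sum() for k in ks]


# The errors logged above then follow as (1.0 - correct / batch_size) * 100.0.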