# Tail of the MODEL section: calibration and AutoML search settings.
_C.MODEL.TEMPERATURE_SCALING = 1
_C.MODEL.AUTOML = False
_C.MODEL.AUTOML_TRIALS = 1000

# ---------------------------------------------------------------------
# Solver (optimizer) settings.
# ---------------------------------------------------------------------
_C.SOLVER = CN()
_C.SOLVER.NAME = 'sgd'
_C.SOLVER.LR = 0.0002
_C.SOLVER.WEIGHT_DECAY = 0.00001

# Learning-rate scheduler settings.
_C.SOLVER.SCHEDULER = CN()
_C.SOLVER.SCHEDULER.NAME = 'exponential'
# NOTE(review): EPOCHS reads like step-decay milestones even though the
# default NAME is 'exponential' -- confirm how the trainer interprets it.
_C.SOLVER.SCHEDULER.EPOCHS = (40, 50)
_C.SOLVER.SCHEDULER.GAMMA = 0.1

# ---------------------------------------------------------------------
# Data loading and augmentation settings.
# ---------------------------------------------------------------------
_C.DATA = CN()
_C.DATA.SIZE = (299, 299)  # w, h
_C.DATA.SHUFFLE_SIZE = 10
# Per-channel normalization statistics (the standard ImageNet values).
_C.DATA.MEAN = [0.485, 0.456, 0.406]
_C.DATA.STD = [0.229, 0.224, 0.225]
# Optional augmentations; each toggle has a companion parameter below it.
_C.DATA.RANDOM_CROP = False
_C.DATA.RANDOM_CROP_SIZE = (299, 299)
_C.DATA.RANDOM_BRIGHTNESS = False
_C.DATA.RANDOM_BRIGHTNESS_DELTA = 0.5
_C.DATA.RANDOM_CONTRAST = False
_C.DATA.RANDOM_CONTRAST_RANGE = (0.1, 0.1)
_C.DATA.RANDOM_FLIP_LR = True
def __init__(self, config_file: Optional[str] = None, override_list: Optional[List[Any]] = None):
    r"""
    Build the default config, then apply overrides and freeze it.

    Override precedence (lowest to highest): defaults below, then values
    from ``config_file``, then ``override_list``, then derived params
    added by ``self.add_derived_params()``.

    Parameters
    ----------
    config_file: Optional[str]
        Path to a YAML file whose values override the defaults.
    override_list: Optional[List[Any]]
        Flat list of alternating key/value pairs, e.g.
        ``["OPTIM.LR", 0.01]``, applied after ``config_file``.
        ``None`` means no overrides. (The previous mutable default
        ``[]`` was replaced with ``None`` to avoid the shared
        mutable-default-argument pitfall; behavior is unchanged.)
    """
    if override_list is None:
        override_list = []

    _C = CN()
    # Random seed for NumPy and PyTorch, important for reproducibility.
    _C.RANDOM_SEED = 0
    # Train with Automatic Mixed Precision (native PyTorch).
    _C.AMP = True
    # Set CUDNN deterministic flag (torch.backends.cudnn.deterministic).
    # Setting this will ensure exact results on every run at the cost of
    # little slowdown. Good for debugging.
    _C.CUDNN_DETERMINISTIC = False
    # Set CUDNN benchmark flag (torch.backends.cudnn.benchmark). Enables
    # CUDNN to select fastest implementation for operations based on GPU.
    # May change results (in decimals) on different hardware, but faster
    # to train. Turn off while debugging.
    _C.CUDNN_BENCHMARK = True

    # ---------------------------------------------------------------------
    # Data paths and parameters related to dataloading.
    # ---------------------------------------------------------------------
    _C.DATA = CN()
    # Path to the dataset root, which structure as per README. Path is
    # assumed to be relative to project root.
    _C.DATA.ROOT = "datasets/coco"
    # Path to .model file generated by ``sentencepiece``.
    _C.DATA.TOKENIZER_MODEL = "datasets/vocab/coco_10k.model"

    # Handy config params for vocab size and indices of special tokens.
    # While these can be picked up from the tokenizer, having these in
    # the config makes it easy to create a model without instantiating too
    # many tokenizer instances (especially when not needed, e.g. model zoo).
    # These must match according to what's present in ``TOKENIZER_VOCAB``
    # and ``TOKENIZER_MODEL`` above.
    _C.DATA.VOCAB_SIZE = 10000
    # Index of out-of-vocabulary (and padding) token.
    _C.DATA.UNK_INDEX = 0
    # Index of the start-of-sentence [SOS] token.
    _C.DATA.SOS_INDEX = 1
    # Index of the end-of-sentence [EOS] token.
    _C.DATA.EOS_INDEX = 2
    # Index of the word masking token. While not used for captioning, having
    # this extra token makes it possible to train an MLM model without
    # re-creating a new vocab mapping.
    _C.DATA.MASK_INDEX = 3

    # Size of the image (square) to crop from original input image.
    _C.DATA.IMAGE_CROP_SIZE = 224
    # Maximum length of input caption (number of tokens).
    # Longer captions will be truncated up to this length.
    _C.DATA.MAX_CAPTION_LENGTH = 30

    # COCO Captions has five captions per image. If ``True``, training will
    # use one random caption per image (data efficiency ablations).
    _C.DATA.USE_SINGLE_CAPTION = False
    # Percentage of dataset to use for training (data efficiency ablations).
    _C.DATA.USE_PERCENTAGE = 100.0

    # List of image transforms (pre-processing and data augmentation) to be
    # applied sequentially (always or randomly) during training and
    # validation. Refer ``virtex/factories.py`` for all possible transforms.
    _C.DATA.IMAGE_TRANSFORM_TRAIN = [
        "random_resized_crop",
        "horizontal_flip",
        "color_jitter",
        "normalize",
    ]
    _C.DATA.IMAGE_TRANSFORM_VAL = [
        "smallest_resize",
        "center_crop",
        "normalize",
    ]

    # Hyper-parameters for masked LM pretraining task. These are only used
    # when ``MODEL.NAME`` is "masked_lm".
    _C.DATA.MASKED_LM = CN()
    # Fraction of tokens to choose for masking, this must be less than 1.
    _C.DATA.MASKED_LM.MASK_PROPORTION = 0.15
    # Probability to replace chosen tokens with [MASK] token.
    _C.DATA.MASKED_LM.MASK_PROBABILITY = 0.85
    # Probability to replace chosen tokens with a random token.
    _C.DATA.MASKED_LM.REPLACE_PROBABILITY = 0.10

    # ---------------------------------------------------------------------
    # Model architecture: visual backbone and textual head.
    # ---------------------------------------------------------------------
    _C.MODEL = CN()
    # Name of model, based on pretraining task.
    # Possible choices: {"token_classification", "multilabel_classification",
    # "captioning", "bicaptioning", "masked_lm", "virtex"}
    _C.MODEL.NAME = "virtex"

    _C.MODEL.VISUAL = CN()
    # Name of visual backbone. Possible choices: {"blind", "torchvision"}
    # Models from torchvision can be specified as shown below.
    _C.MODEL.VISUAL.NAME = "torchvision::resnet50"
    # Number of channels in pooled spatial features of visual backbone.
    _C.MODEL.VISUAL.FEATURE_SIZE = 2048
    # Whether to load ImageNet pretrained weights into visual backbone.
    _C.MODEL.VISUAL.PRETRAINED = False
    # Whether to keep visual backbone frozen and train only textual head.
    _C.MODEL.VISUAL.FROZEN = False

    _C.MODEL.TEXTUAL = CN()
    # Name of textual head. Set to "none" for MODEL.NAME = "*_classification".
    # Possible choices: {"transformer_postnorm", "transformer_prenorm"}.
    # Architectural hyper-parameters are specified as shown above.
    _C.MODEL.TEXTUAL.NAME = "transformer_postnorm::L1_H2048_A32_F8192"
    # L = Number of layers in the transformer.
    # H = Hidden size of the transformer (embeddings, attention features).
    # A = Number of attention heads in the transformer.
    # F = Size of feedforward layers in the transformer.
    # Typically, we have (A = H / 64) and (F = 4 * H).

    # Dropout probability for embedding, hidden features in textual head.
    _C.MODEL.TEXTUAL.DROPOUT = 0.1

    # ---------------------------------------------------------------------
    # Optimization hyper-parameters, default values are for pretraining
    # our best model on bicaptioning task (COCO Captions).
    # ---------------------------------------------------------------------
    _C.OPTIM = CN()
    # Name of optimizer to use. Supported values: {"sgd", "adamw"}.
    # AdamW uses default (beta1, beta2) values from PyTorch.
    _C.OPTIM.OPTIMIZER_NAME = "sgd"
    # Momentum co-efficient for SGD. Ignored for AdamW.
    _C.OPTIM.SGD_MOMENTUM = 0.9
    # Weight decay co-efficient for the optimizer.
    _C.OPTIM.WEIGHT_DECAY = 0.0001
    # Regex pattern of params for which there will be no weight decay.
    _C.OPTIM.NO_DECAY = ".*textual.(embedding|transformer).*(norm.*|bias)"
    # Max gradient norm for clipping to avoid exploding gradients.
    _C.OPTIM.CLIP_GRAD_NORM = 10

    # Wrap our optimizer with Lookahead (https://arxiv.org/abs/1907.08610).
    _C.OPTIM.USE_LOOKAHEAD = False
    _C.OPTIM.LOOKAHEAD_ALPHA = 0.5
    _C.OPTIM.LOOKAHEAD_STEPS = 5

    # We set different learning rates for CNN (visual backbone) and rest of
    # the model. CNN LR is typically much higher for training from scratch.
    # Both LRs undergo same warmup-decay schedules.

    # Total batch size (will be distributed evenly across GPUs).
    _C.OPTIM.BATCH_SIZE = 256
    # Max learning rate for CNN (visual backbone).
    _C.OPTIM.CNN_LR = 0.2
    # Max learning rate for rest of the model.
    _C.OPTIM.LR = 0.001
    # Number of iterations to train for, batches are randomly sampled.
    _C.OPTIM.NUM_ITERATIONS = 500000

    # Number of steps at the start of training for linear LR warmup.
    _C.OPTIM.WARMUP_STEPS = 10000
    # Learning rate annealing schedule for decay after warmup.
    # Possible choices: {"none", "linear", "cosine", "multistep"}.
    _C.OPTIM.LR_DECAY_NAME = "cosine"
    # Steps to decay LR for "multistep" schedule.
    _C.OPTIM.LR_STEPS = []
    # Factor to multiply with LR for "multistep" schedule.
    _C.OPTIM.LR_GAMMA = 0.1

    # Override parameter values from YAML file first, then from override
    # list, then add derived params.
    self._C = _C
    if config_file is not None:
        self._C.merge_from_file(config_file)
    self._C.merge_from_list(override_list)

    self.add_derived_params()

    # Make an instantiated object of this class immutable.
    self._C.freeze()
# Corresponds to the frame rate reduction ratio, $\alpha$ between the Slow and # Fast pathways. _C.SLOWFAST.ALPHA = 8 # Ratio of channel dimensions between the Slow and Fast pathways. _C.SLOWFAST.FUSION_CONV_CHANNEL_RATIO = 2 # Kernel dimension used for fusing information from Fast pathway to Slow # pathway. _C.SLOWFAST.FUSION_KERNEL_SZ = 5 # ----------------------------------------------------------------------------- # Data options # ----------------------------------------------------------------------------- _C.DATA = CfgNode() # The path to the data directory. _C.DATA.PATH_TO_DATA_DIR = "" # The separator used between path and label. _C.DATA.PATH_LABEL_SEPARATOR = " " # Video path prefix if any. _C.DATA.PATH_PREFIX = "" # The number of frames of the input clip. _C.DATA.NUM_FRAMES = 8 # The video sampling rate of the input clip. _C.DATA.SAMPLING_RATE = 8
# Corresponds to the frame rate reduction ratio, $\alpha$ between the Slow and # Fast pathways. _C.SLOWFAST.ALPHA = 8 # Ratio of channel dimensions between the Slow and Fast pathways. _C.SLOWFAST.FUSION_CONV_CHANNEL_RATIO = 2 # Kernel dimension used for fusing information from Fast pathway to Slow # pathway. _C.SLOWFAST.FUSION_KERNEL_SZ = 5 # ----------------------------------------------------------------------------- # Data options # ----------------------------------------------------------------------------- _C.DATA = CfgNode(new_allowed=new_allowed) # The path to the data directory. _C.DATA.PATH_TO_DATA_DIR = "" # The separator used between path and label. _C.DATA.PATH_LABEL_SEPARATOR = " " # Video path prefix if any. _C.DATA.PATH_PREFIX = "" # The spatial crop size of the input clip. _C.DATA.CROP_SIZE = 224 # The number of frames of the input clip. _C.DATA.NUM_FRAMES = 8