Example #1
# NOTE: the original snippet starts mid-file; the following header lines are
# assumed so that the fragment runs on its own.
from yacs.config import CfgNode as CN

_C = CN()
_C.MODEL = CN()

_C.MODEL.TEMPERATURE_SCALING = 1

_C.MODEL.AUTOML = False
_C.MODEL.AUTOML_TRIALS = 1000

_C.SOLVER = CN()
_C.SOLVER.NAME = 'sgd'
_C.SOLVER.LR = 0.0002
_C.SOLVER.WEIGHT_DECAY = 0.00001

_C.SOLVER.SCHEDULER = CN()
_C.SOLVER.SCHEDULER.NAME = 'exponential'
_C.SOLVER.SCHEDULER.EPOCHS = (40, 50)
_C.SOLVER.SCHEDULER.GAMMA = 0.1

_C.DATA = CN()
_C.DATA.SIZE = (299, 299)  # (width, height)
_C.DATA.SHUFFLE_SIZE = 10
_C.DATA.MEAN = [0.485, 0.456, 0.406]
_C.DATA.STD = [0.229, 0.224, 0.225]

_C.DATA.RANDOM_CROP = False
_C.DATA.RANDOM_CROP_SIZE = (299, 299)

_C.DATA.RANDOM_BRIGHTNESS = False
_C.DATA.RANDOM_BRIGHTNESS_DELTA = 0.5

_C.DATA.RANDOM_CONTRAST = False
_C.DATA.RANDOM_CONTRAST_RANGE = (0.1, 0.1)

_C.DATA.RANDOM_FLIP_LR = True
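
A minimal usage sketch (not part of the original snippet): the optional augmentation flags above can be toggled through standard yacs overrides, assuming ``_C`` is a yacs ``CfgNode`` as in the assumed header.

cfg = _C.clone()
cfg.merge_from_list([
    "DATA.RANDOM_CROP", "True",        # enable random cropping
    "DATA.RANDOM_BRIGHTNESS", "True",  # enable brightness jitter
    "SOLVER.LR", "0.001",              # override the learning rate
])
cfg.freeze()
print(cfg.DATA.RANDOM_CROP)  # True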
Example #2
from typing import Any, List, Optional

from yacs.config import CfgNode as CN


class Config(object):
    # NOTE: the class name above is assumed; the original fragment only shows
    # this ``__init__`` method.
    def __init__(self,
                 config_file: Optional[str] = None,
                 override_list: List[Any] = []):
        _C = CN()

        # Random seed for NumPy and PyTorch, important for reproducibility.
        _C.RANDOM_SEED = 0
        # Train with Automatic Mixed Precision (native PyTorch).
        _C.AMP = True
        # Set CUDNN deterministic flag (torch.backends.cudnn.deterministic).
        # Setting this will ensure exact results on every run, at the cost of a
        # small slowdown. Good for debugging.
        _C.CUDNN_DETERMINISTIC = False
        # Set CUDNN benchmark flag (torch.backends.cudnn.benchmark). Enables
        # CUDNN to select the fastest implementation for operations based on the GPU.
        # May cause tiny numerical differences across hardware, but speeds up
        # training. Turn off while debugging.
        _C.CUDNN_BENCHMARK = True

        # ---------------------------------------------------------------------
        #   Data paths and parameters related to dataloading.
        # ---------------------------------------------------------------------
        _C.DATA = CN()

        # Path to the dataset root, structured as described in the README. The
        # path is assumed to be relative to the project root.
        _C.DATA.ROOT = "datasets/coco"
        # Path to .model file generated by ``sentencepiece``.
        _C.DATA.TOKENIZER_MODEL = "datasets/vocab/coco_10k.model"

        # Handy config params for vocab size and indices of special tokens.
        # While these can be picked up from the tokenizer, having these in
        # the config makes it easy to create a model without instantiating too
        # many tokenizer instances (especially when not needed, e.g. model zoo).
        # These must match what is present in ``TOKENIZER_MODEL`` above.
        _C.DATA.VOCAB_SIZE = 10000
        # Index of out-of-vocabulary (and padding) token.
        _C.DATA.UNK_INDEX = 0
        # Index of the start-of-sentence [SOS] token.
        _C.DATA.SOS_INDEX = 1
        # Index of the end-of-sentence [EOS] token.
        _C.DATA.EOS_INDEX = 2
        # Index of the word masking token. While not used for captioning, having
        # this extra token makes it possible to train an MLM model without
        # creating a new vocab mapping.
        _C.DATA.MASK_INDEX = 3

        # Size of the image (square) to crop from original input image.
        _C.DATA.IMAGE_CROP_SIZE = 224
        # Maximum length of input caption (number of tokens).
        # Longer captions will be truncated to this length.
        _C.DATA.MAX_CAPTION_LENGTH = 30

        # COCO Captions has five captions per image. If ``True``, training will
        # use one random caption per image (data efficiency ablations).
        _C.DATA.USE_SINGLE_CAPTION = False
        # Percentage of dataset to use for training (data efficiency ablations).
        _C.DATA.USE_PERCENTAGE = 100.0

        # List of image transforms (pre-processing and data augmentation) to be
        # applied sequentially (always or randomly) during training and
        # validation. Refer to ``virtex/factories.py`` for all possible transforms.
        _C.DATA.IMAGE_TRANSFORM_TRAIN = [
            "random_resized_crop",
            "horizontal_flip",
            "color_jitter",
            "normalize",
        ]
        _C.DATA.IMAGE_TRANSFORM_VAL = [
            "smallest_resize",
            "center_crop",
            "normalize",
        ]

        # Hyper-parameters for masked LM pretraining task. These are only used
        # when ``MODEL.NAME`` is "masked_lm".
        _C.DATA.MASKED_LM = CN()
        # Fraction of tokens to choose for masking; this must be less than 1.
        _C.DATA.MASKED_LM.MASK_PROPORTION = 0.15
        # Probability to replace chosen tokens with [MASK] token.
        _C.DATA.MASKED_LM.MASK_PROBABILITY = 0.85
        # Probability to replace chosen tokens with a random token.
        _C.DATA.MASKED_LM.REPLACE_PROBABILITY = 0.10
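        # Worked example (assumed BERT-style interpretation, not stated in the
        # snippet): for a full-length 30-token caption, about 0.15 * 30 ≈ 4-5
        # tokens are chosen; each chosen token becomes [MASK] with probability
        # 0.85, a random vocab token with probability 0.10, and is presumably
        # left unchanged otherwise (remaining 0.05).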

        # ---------------------------------------------------------------------
        #   Model architecture: visual backbone and textual head.
        # ---------------------------------------------------------------------
        _C.MODEL = CN()

        # Name of model, based on pretraining task.
        # Possible choices: {"token_classification", "multilabel_classification",
        # "captioning", "bicaptioning", "masked_lm", "virtex"}
        _C.MODEL.NAME = "virtex"

        _C.MODEL.VISUAL = CN()
        # Name of visual backbone. Possible choices: {"blind", "torchvision"}
        # Models from torchvision can be specified as shown below.
        _C.MODEL.VISUAL.NAME = "torchvision::resnet50"
        # Number of channels in pooled spatial features of visual backbone.
        _C.MODEL.VISUAL.FEATURE_SIZE = 2048
        # Whether to load ImageNet pretrained weights into visual backbone.
        _C.MODEL.VISUAL.PRETRAINED = False
        # Whether to keep visual backbone frozen and train only textual head.
        _C.MODEL.VISUAL.FROZEN = False

        _C.MODEL.TEXTUAL = CN()
        # Name of textual head. Set to "none" for MODEL.NAME = "*_classification".
        # Possible choices: {"transformer_postnorm", "transformer_prenorm"}.
        # Architectural hyper-parameters are encoded in the name, as explained below.
        _C.MODEL.TEXTUAL.NAME = "transformer_postnorm::L1_H2048_A32_F8192"
        # L = Number of layers in the transformer.
        # H = Hidden size of the transformer (embeddings, attention features).
        # A = Number of attention heads in the transformer.
        # F = Size of feedforward layers in the transformer.
        # Typically, we have (A = H / 64) and (F = 4 * H).
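        # Sanity check for the default above: H = 2048, so A = 2048 / 64 = 32
        # and F = 4 * 2048 = 8192, matching "L1_H2048_A32_F8192".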

        # Dropout probability for embeddings and hidden features in the textual head.
        _C.MODEL.TEXTUAL.DROPOUT = 0.1

        # ---------------------------------------------------------------------
        #   Optimization hyper-parameters, default values are for pretraining
        #   our best model on bicaptioning task (COCO Captions).
        # ---------------------------------------------------------------------
        _C.OPTIM = CN()

        # Name of optimizer to use. Supported values: {"sgd", "adamw"}.
        # AdamW uses default (beta1, beta2) values from PyTorch.
        _C.OPTIM.OPTIMIZER_NAME = "sgd"
        # Momentum coefficient for SGD. Ignored for AdamW.
        _C.OPTIM.SGD_MOMENTUM = 0.9
        # Weight decay coefficient for the optimizer.
        _C.OPTIM.WEIGHT_DECAY = 0.0001
        # Regex pattern of params for which there will be no weight decay.
        _C.OPTIM.NO_DECAY = ".*textual.(embedding|transformer).*(norm.*|bias)"
        # Max gradient norm for clipping to avoid exploding gradients.
        _C.OPTIM.CLIP_GRAD_NORM = 10

        # Wrap our optimizer with Lookahead (https://arxiv.org/abs/1907.08610).
        _C.OPTIM.USE_LOOKAHEAD = False
        _C.OPTIM.LOOKAHEAD_ALPHA = 0.5
        _C.OPTIM.LOOKAHEAD_STEPS = 5

        # We set different learning rates for CNN (visual backbone) and rest of
        # the model. CNN LR is typically much higher for training from scratch.
        # Both LRs undergo the same warmup-decay schedule.

        # Total batch size (will be distributed evenly across GPUs).
        _C.OPTIM.BATCH_SIZE = 256
        # Max learning rate for CNN (visual backbone).
        _C.OPTIM.CNN_LR = 0.2
        # Max learning rate for rest of the model.
        _C.OPTIM.LR = 0.001
        # Number of iterations to train for; batches are randomly sampled.
        _C.OPTIM.NUM_ITERATIONS = 500000

        # Number of steps at the start of training for linear LR warmup.
        _C.OPTIM.WARMUP_STEPS = 10000
        # Learning rate annealing schedule for decay after warmup.
        # Possible choices: {"none", "linear", "cosine", "multistep"}.
        _C.OPTIM.LR_DECAY_NAME = "cosine"
        # Steps to decay LR for "multistep" schedule.
        _C.OPTIM.LR_STEPS = []
        # Factor to multiply with LR for "multistep" schedule.
        _C.OPTIM.LR_GAMMA = 0.1

        # Override parameter values from YAML file first, then from override
        # list, then add derived params.
        self._C = _C
        if config_file is not None:
            self._C.merge_from_file(config_file)
        self._C.merge_from_list(override_list)

        self.add_derived_params()

        # Make an instantiated object of this class immutable.
        self._C.freeze()
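
The OPTIM block above describes a linear warmup followed by cosine decay (LR_DECAY_NAME = "cosine"). Below is a minimal sketch of such a schedule using PyTorch's LambdaLR; it is an illustration with a toy optimizer and a single parameter group, not the snippet's actual scheduler implementation.

import math

import torch
from torch.optim.lr_scheduler import LambdaLR

model = torch.nn.Linear(10, 10)  # toy model standing in for the real one
optimizer = torch.optim.SGD(model.parameters(), lr=0.001,
                            momentum=0.9, weight_decay=0.0001)

warmup_steps, total_steps = 10_000, 500_000  # OPTIM.WARMUP_STEPS, OPTIM.NUM_ITERATIONS

def lr_multiplier(step: int) -> float:
    # Linear warmup from 0 to the max LR over the first ``warmup_steps`` steps.
    if step < warmup_steps:
        return step / max(1, warmup_steps)
    # Cosine decay from the max LR down to 0 over the remaining steps.
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return 0.5 * (1.0 + math.cos(math.pi * progress))

scheduler = LambdaLR(optimizer, lr_lambda=lr_multiplier)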
Example #3
# NOTE: the original snippet starts mid-file; the following header lines are
# assumed so that the fragment runs on its own.
from fvcore.common.config import CfgNode  # or: from yacs.config import CfgNode

_C = CfgNode()
_C.SLOWFAST = CfgNode()

# Corresponds to the frame rate reduction ratio, $\alpha$ between the Slow and
# Fast pathways.
_C.SLOWFAST.ALPHA = 8

# Ratio of channel dimensions between the Slow and Fast pathways.
_C.SLOWFAST.FUSION_CONV_CHANNEL_RATIO = 2

# Kernel dimension used for fusing information from Fast pathway to Slow
# pathway.
_C.SLOWFAST.FUSION_KERNEL_SZ = 5


# -----------------------------------------------------------------------------
# Data options
# -----------------------------------------------------------------------------
_C.DATA = CfgNode()

# The path to the data directory.
_C.DATA.PATH_TO_DATA_DIR = ""

# The separator used between path and label.
_C.DATA.PATH_LABEL_SEPARATOR = " "

# Video path prefix if any.
_C.DATA.PATH_PREFIX = ""

# The number of frames of the input clip.
_C.DATA.NUM_FRAMES = 8

# The video sampling rate of the input clip.
_C.DATA.SAMPLING_RATE = 8
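
A sketch (not taken from the snippet) of how the DATA path options above are typically consumed: annotation files such as "train.csv" list one "<video path><separator><label>" pair per line, with paths resolved against PATH_PREFIX.

import os

def read_annotations(cfg, split: str = "train"):
    """Parse "<path><sep><label>" lines into (video_path, label) tuples."""
    items = []
    csv_path = os.path.join(cfg.DATA.PATH_TO_DATA_DIR, f"{split}.csv")
    with open(csv_path) as f:
        for line in f:
            path, label = line.strip().split(cfg.DATA.PATH_LABEL_SEPARATOR)
            items.append((os.path.join(cfg.DATA.PATH_PREFIX, path), int(label)))
    return items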
Example #4
# NOTE: as in Example #3, the snippet starts mid-file; assumed header lines:
from fvcore.common.config import CfgNode  # or: from yacs.config import CfgNode

new_allowed = True  # assumed value; the fragment uses ``new_allowed`` without defining it
_C = CfgNode()
_C.SLOWFAST = CfgNode()

# Corresponds to the frame rate reduction ratio, $\alpha$ between the Slow and
# Fast pathways.
_C.SLOWFAST.ALPHA = 8

# Ratio of channel dimensions between the Slow and Fast pathways.
_C.SLOWFAST.FUSION_CONV_CHANNEL_RATIO = 2

# Kernel dimension used for fusing information from Fast pathway to Slow
# pathway.
_C.SLOWFAST.FUSION_KERNEL_SZ = 5

# -----------------------------------------------------------------------------
# Data options
# -----------------------------------------------------------------------------
_C.DATA = CfgNode(new_allowed=new_allowed)

# The path to the data directory.
_C.DATA.PATH_TO_DATA_DIR = ""

# The separator used between path and label.
_C.DATA.PATH_LABEL_SEPARATOR = " "

# Video path prefix if any.
_C.DATA.PATH_PREFIX = ""

# The spatial crop size of the input clip.
_C.DATA.CROP_SIZE = 224

# The number of frames of the input clip.
_C.DATA.NUM_FRAMES = 8
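
The ``new_allowed`` flag above controls whether merging may introduce config keys that are absent from the defaults. A small illustration with plain yacs (which provides the flag; fvcore's CfgNode subclasses it), using hypothetical keys:

from yacs.config import CfgNode

base = CfgNode(new_allowed=True)
base.NUM_FRAMES = 8

extra = CfgNode()
extra.NUM_FRAMES = 16
extra.CROP_SIZE = 256  # key that does not exist in ``base``

# Succeeds only because ``base`` was created with new_allowed=True;
# otherwise yacs raises "Non-existent config key".
base.merge_from_other_cfg(extra)
print(base.CROP_SIZE)  # 256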