def test():
    # declare the main named dimension variables using the tsalib api
    # recall these values anywhere in the program using `get_dim_vars`
    from tsalib import dim_vars
    dim_vars('Batch(b):10 Length(t):100 Hidden(d):1024')

    from tsanley.dynamic import init_analyzer
    init_analyzer(trace_func_names=['f*'], show_updates=True)

    test_func()
def test2():
    # declare the named dimension variables using the tsalib api
    from tsalib import dim_vars
    dim_vars('Batch(b):10 Length(t):100 Hidden(d):1024')

    # initialize tsanley's dynamic shape analyzer
    from tsanley.dynamic import init_analyzer
    init_analyzer(trace_func_names=['foo'], show_updates=True, debug=False)  #check_tsa=True, debug=False

    test_foo()
def test_resnet():
    # declare dim vars: required for checking
    B, C, Ci, Co = dim_vars('Batch(b):10 Channels(c):3 ChannelsIn(ci) ChannelsOut(co)')
    H, W, Ex = dim_vars('Height(h):224 Width(w):224 BlockExpansion(e):1')

    rs18 = resnet18()
    x: 'bchw' = torch.ones(10, 3, 224, 224)

    from tsanley.dynamic import init_analyzer
    #init_analyzer(trace_func_names=['ResNet.forward', 'BasicBlock.forward'])
    init_analyzer(trace_func_names=['ResNet.forward'])

    out = rs18.forward(x)
    print(out.size())
def __init__(self,
             num_heads: int,
             tensor_1_dim: int,
             tensor_1_projected_dim: int = None,
             tensor_2_dim: int = None,
             tensor_2_projected_dim: int = None,
             internal_similarity: SimilarityFunction = DotProductSimilarity()) -> None:
    super(MultiHeadedSimilarity, self).__init__()
    self.num_heads = num_heads
    self._internal_similarity = internal_similarity
    tensor_1_projected_dim = tensor_1_projected_dim or tensor_1_dim
    tensor_2_dim = tensor_2_dim or tensor_1_dim
    tensor_2_projected_dim = tensor_2_projected_dim or tensor_2_dim
    if tensor_1_projected_dim % num_heads != 0:
        raise ConfigurationError("Projected dimension not divisible by number of heads: %d, %d"
                                 % (tensor_1_projected_dim, num_heads))
    if tensor_2_projected_dim % num_heads != 0:
        raise ConfigurationError("Projected dimension not divisible by number of heads: %d, %d"
                                 % (tensor_2_projected_dim, num_heads))

    # tsalib dim vars defined locally (to minimize changes from the original implementation)
    # better: define and store them in the config dictionary and use them everywhere
    self.D1, self.D2, self.D1p, self.D2p = dim_vars(
        'D1:{0} D2:{1} D1p:{2} D2p:{3}'.format(tensor_1_dim, tensor_2_dim,
                                               tensor_1_projected_dim, tensor_2_projected_dim))

    # original impl
    self._tensor_1_projection = Parameter(torch.Tensor(tensor_1_dim, tensor_1_projected_dim))
    self._tensor_2_projection = Parameter(torch.Tensor(tensor_2_dim, tensor_2_projected_dim))

    # with tsalib:
    self._tensor_1_projection: (self.D1, self.D1p) = Parameter(torch.Tensor(self.D1, self.D1p))
    self._tensor_2_projection: (self.D2, self.D2p) = Parameter(torch.Tensor(self.D2, self.D2p))

    self.reset_parameters()
def forward_old(self, tensor_1: 'b,t,d1', tensor_2: 'b,t,d2'):
    # This is the original `forward` implementation
    # note the shape 'surgery' below
    H = self.num_heads
    B, T = dim_vars(f'Batch(b):{tensor_1.size(0)} T(t):{tensor_1.size(1)}')
    D1, D2, D1p, D2p = self.D1, self.D2, self.D1p, self.D2p

    projected_tensor_1: (B, T, D1p) = torch.matmul(tensor_1, self._tensor_1_projection)
    projected_tensor_2: (B, T, D2p) = torch.matmul(tensor_2, self._tensor_2_projection)

    # Here we split the last dimension of the tensors from (..., projected_dim) to
    # (..., num_heads, projected_dim / num_heads), using tensor.view().
    last_dim_size = projected_tensor_1.size(-1) // H
    new_shape = list(projected_tensor_1.size())[:-1] + [H, last_dim_size]
    split_tensor_1: (B, T, H, D1p // H) = projected_tensor_1.view(*new_shape)

    last_dim_size = projected_tensor_2.size(-1) // H
    new_shape = list(projected_tensor_2.size())[:-1] + [H, last_dim_size]
    split_tensor_2: (B, T, H, D2p // H) = projected_tensor_2.view(*new_shape)

    # And then we pass this off to our internal similarity function. Because the similarity
    # functions don't care what dimension their input has, and only look at the last dimension,
    # we don't need to do anything special here. It will just compute similarity on the
    # projection dimension for each head, returning a tensor of shape (..., num_heads).
    ret: (B, T, H) = self._internal_similarity(split_tensor_1, split_tensor_2)
    return ret
def __init__(self, block, layers, num_classes=1000):
    #print(f'block expansion {block.expansion}')
    self.inplanes = 64
    N = dim_vars(f'N(nc):{num_classes}')
    super(ResNet, self).__init__()
    self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
    self.bn1 = nn.BatchNorm2d(64)
    self.relu = nn.ReLU(inplace=True)
    self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
    self.layer1 = self._make_layer(block, 64, layers[0])
    self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
    self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
    self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
    self.avgpool = nn.AvgPool2d(7, stride=1)
    self.fc = nn.Linear(512 * block.expansion, num_classes)

    for m in self.modules():
        if isinstance(m, nn.Conv2d):
            nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
        elif isinstance(m, nn.BatchNorm2d):
            nn.init.constant_(m.weight, 1)
            nn.init.constant_(m.bias, 0)
def test_decls():
    print('\nTest declarations ..')
    # local declarations
    print(f'B, C, D = {_B}, {_C}, {_D}')

    # exists_ok=True allows overwriting previous declarations
    H, W = dim_vars('Height(h):256 Width(w):256', exists_ok=True)
    print(f'H, W = {H}, {W}')
def test_decls():
    print('\n Test declarations ..')
    # local declarations
    print(f'B, C, D = {B}, {C}, {D}')

    H, W = dim_vars('Height(h):256 Width(w):256', check=False)
    print(f'H, W = {H}, {W}')
def test_align():
    B, T, D = dim_vars('Batch(b):20 SeqLength(t):10 EmbeddingDim(d):100', exists_ok=True)
    x1 = np.random.randn(D, D)
    x2 = np.random.randn(B, D, T, D)

    x1_aligned = alignto((x1, 'dd'), 'bdtd')
    assert x1_aligned.shape == (1, D, 1, D)

    print('test_align: all assertions passed')
def test_pytorch():
    print('\n Test usage with pytorch ..')
    B, D = dim_vars('Batch:2 EmbedDim:3')

    import torch
    a = torch.Tensor([[1., 2., 4.], [3., 6., 9.]])
    assert a.size() == (B, D)

    b = torch.stack([a, a])
    print('Asserting b.size() == (2, B, D)')
    assert b.size() == (2, B, D)

    c = torch.cat([a, a], dim=1)
    assert c.size() == (B, D * 2)
def test_decls():
    print('\nTest declarations ..')
    # local declarations
    print(f'B, C, D = {_B}, {_C}, {_D}')

    # exists_ok=True allows overwriting previous declarations
    H, W = dim_vars('Height(h):256 Width(w):256', exists_ok=True)
    print(f'H, W = {H}, {W}')

    # test updating a dim var's length
    H.update_len(1024)
    print(f'H = {H}')

    update_dim_vars_len({'h': 512, 'w': 128})
    H, W = get_dim_vars('h w')
    print(f'H, W = {H}, {W}')
def modeling_embedding_lookup(input_ids: 'bti'):
    # illustrates local dim var usage; `i` is not declared globally as a dim var
    B, T, D = dim_vars('B(b):13 L(t):7 D(d):32')
    embedding_size = D
    i = get_shape_list(input_ids)[-1]  # num inputs

    output: 'b*t*i,d'

    # OLD (commented out; superseded by the warp one-liner below)
    #input_shape: 'bti' = get_shape_list(input_ids)
    #output: 'btd' = tf.reshape(output, input_shape[0:-1] + [input_shape[-1] * embedding_size])

    # NEW: crisp one-liner
    output: 'btd' = warp(output, tfms=f'b*t*{i},d -> b,t,d*{i}', tfm_names='r')

    assert output.get_shape() == (B, T, D)
def forward(self, tensor_1: 'b,t,d1', tensor_2: 'b,t,d2'):
    # Cleaner implementation with tsalib
    # B, T, H defined locally here (to minimize changes to the original implementation)
    # better: define and store them in the config dictionary and use them everywhere
    B, T, H = dim_vars(f'Batch(b):{tensor_1.size(0)} T(t):{tensor_1.size(1)} H(h):{self.num_heads}')
    D1, D2, D1p, D2p = self.D1, self.D2, self.D1p, self.D2p

    projected_tensor_1: (B, T, D1p) = torch.matmul(tensor_1, self._tensor_1_projection)
    projected_tensor_2: (B, T, D2p) = torch.matmul(tensor_2, self._tensor_2_projection)

    split_tensor_1 = projected_tensor_1.view(B, T, H, D1p // H)
    split_tensor_2 = projected_tensor_2.view(B, T, H, D2p // H)

    # And then we pass this off to our internal similarity function. Because the similarity
    # functions don't care what dimension their input has, and only look at the last dimension,
    # we don't need to do anything special here. It will just compute similarity on the
    # projection dimension for each head, returning a tensor of shape (..., num_heads).
    ret: (B, T, H) = self._internal_similarity(split_tensor_1, split_tensor_2)
    return ret
import torch
import torch.nn as nn
import torch.utils.model_zoo as model_zoo

# Original file: https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py
# Updated to add shape annotations (BasicBlock and ResNet modules)

import sys
sys.path.append('../')
from tsalib import dim_vars

B, C, Ci, Co = dim_vars('Batch Channels ChannelsIn ChannelsOut')
H, W, Ex = dim_vars('Height Width BlockExpansion')

__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152']

model_urls = {
    'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
    'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
    'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
    'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
    'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}


def conv3x3(in_planes, out_planes, stride=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(in_planes, out_planes,
# - Avoid deeper integration into popular tensor libraries to keep `tsalib` light-weight
#   and avoid backend-inflicted bugs.
#
# Some popular models (resnet, transformer) annotated/re-written with tsalib can be found
# in the [models](models/) directory.
#
# ## Declare dimension variables
#
# Dimension variables model both the `name` and the default `size` of a tensor dimension.
# Format: **name(symbol):size** -- `symbol` and `size` are optional.
#
# We can declare dimension variables **globally**: the dimensions used in a program are
# known upfront, and programs don't modify dimension names.
# Even better, we can put all these definitions in the config dictionary.

# In[3]:

# global variables prefixed with underscores
_B, _T, _D, _K = dim_vars('Batch(b):20 SeqLength(t):10 EmbeddingDim(d):100 K(k):1')
_C, _H, _W = dim_vars('Channels(c):3 Height(h):256 Width(w):256')


# In[4]:

def test_decls():
    print('\nTest declarations ..')
    # local declarations
    print(f'B, C, D = {_B}, {_C}, {_D}')

    # exists_ok=True allows overwriting previous declarations
    H, W = dim_vars('Height(h):256 Width(w):256', exists_ok=True)
    print(f'H, W = {H}, {W}')
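# Globally declared dim vars can also be recalled anywhere in the program by their
# shorthand symbols via `get_dim_vars` (referenced in the snippets above). A minimal
# sketch; the wrapper function name below is illustrative, and the symbols match the
# global declarations in In[3]:

# In[ ]:

def show_recall_dim_vars():
    from tsalib import get_dim_vars
    # look up the previously declared dim vars by symbol
    b, t, d = get_dim_vars('b t d')
    print(f'b, t, d = {b}, {t}, {d}')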
import sys
sys.path.append('../')
import numpy as np

from tsalib import dim_vars
from tsalib import view_transform as vt
from tsalib.ext import _view_transform as _vt
from tsalib import permute_transform as pt
from tsalib.ext import _permute_transform as _pt
from tsalib import expand_transform as et
from tsalib import warp

B, T, D, K = dim_vars('Batch(b):20 SeqLength(t):10 EmbeddingDim(d):100 K(k):1')
C, H, W = dim_vars('C(c):3 H(h):256 W(w):256')


def test_reshape():
    x: (B, T, D) = np.ones((B, T, D))
    #print(f'Testing Reshape: x ({x.shape}):')
    h = 4

    #print(f'Transforming view {(B,T,D)} to {(B,T,h,D//h)}')
    #new_shape = vt(src=(B,T,D), to=(B,T,h,D//h), in_shape=x.shape)
    #assert new_shape == (B, T, h, D//h)

    x: (B, T, h, D // h) = x.reshape((B, T, h, D // h))
    assert x.shape == (B, T, h, D // h)
    #print(f'After transform, x : {x.shape}\n')

    print('test_reshape: all assertions hold')
import logging

from tsalib import dim_vars

# initializing dimension variables
dim_vars("Batch(B) Width(W) Height(H) Depth(D) Channel(C)")

from pathlib import Path
from typing import Tuple, List, Union

import SimpleITK as sitk
import numpy as np
import tensorflow as tf
from flask import current_app

from model_server.models import NvNet
from model_server.utils.dataset import get_files
from model_server.utils.processing import prepare_data_for_overlay, load_and_preprocess, overlay, \
    load_and_preprocess_label, get_metrics

logger: logging.Logger = logging.getLogger()


def process_request(dir_path: Path, **kwargs) -> Union[np.ndarray, Tuple[Path, str]]:
    """
    Processes the request. Loads the data to the model and saves the result.

    Args:
        dir_path (Path): The path to the directory containing the data files.
            has to be sent for this to work.
import torch
import torch.nn as nn
import torch.utils.model_zoo as model_zoo

# Original file: https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py
# Updated to add shape annotations (BasicBlock and ResNet modules)

import sys
from tsalib import dim_vars

B, C, Ci, Co = dim_vars('Batch(b):10 Channels(c):3 ChannelsIn(ci) ChannelsOut(co)')
H, W, Ex = dim_vars('Height(h):224 Width(w):224 BlockExpansion(e):1')

__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152']

model_urls = {
    'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
    'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
    'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
    'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
    'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}


def conv3x3(in_planes, out_planes, stride=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(in_planes, out_planes,
import sys
sys.path.append('../')

#from typing import List, Sequence, TypeVar
from tsalib import dim_var, dim_vars, declare_common_dim_vars

# definitions in tsalib/ts.py
B, D, V, Dh, T, Te, Td, C, Ci, Co = declare_common_dim_vars()
H, W = dim_vars('Height Width')


def test_numpy():
    import numpy as np

    a: (B, D) = np.array([[1., 2., 3.], [10., 9., 8.]])
    print(f'original array: {(B,D)}: {a.shape}')

    b: (2, B, D) = np.stack([a, a])
    print(f'after stack: {(2,B,D)}: {b.shape}')

    ax = (2, B, D).index(B)
    c: (2, D) = np.mean(b, axis=ax)
    print(f'after mean along axis {B}={ax}: {(2,D)}: {c.shape}')

    # Supports arithmetic over a combination of dim vars and other Python variables
    K = W * 2
    var1 = 10
    print((..., 4, H // 4, K, var1))
import math
import pathlib
import re
import tarfile

import numpy as np
import torch
from torch.nn import Parameter

from allennlp.common.checks import ConfigurationError
from allennlp.common.file_utils import cached_path
from allennlp.common.from_params import FromParams

import sys
sys.path.append('../')
from tsalib import dim_vars, warp

B, T, D, H = dim_vars('Batch SeqLength EmbedDim NumHeads')


def gelu(x: torch.Tensor) -> torch.Tensor:
    return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))


def swish(x: torch.Tensor) -> torch.Tensor:
    return x * torch.sigmoid(x)


_ACTIVATION_FUNCTIONS = {'relu': torch.nn.ReLU, 'swish': swish, 'gelu': gelu}


class LayerNorm(torch.nn.Module):
from tsalib import dim_vars
from sklearn.model_selection import ParameterSampler


def split(a, n):
    k, m = divmod(len(a), n)
    return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))


B, N, E = dim_vars('Batch_Size Graph_Nodes Edge_List')
D, H, K, C = dim_vars('Node_Input_Features Node_Hidden_Features Graph_Kernel_Dim Num_Classes')

hyperparams_grid = {
    'model_name': ['universal-graph-embedding'],
    'dataset_name': ['DD'],
    'save_steps': [200],
    'run_on_comet': [True],
    'gpu_device': [0],
    'hidden_dim': [16, 32, 64],
    'num_gconv_layers': [5, 7],
    'num_gfc_layers': [2, 4],
    'batch_size': [128, 64, 32],
    'drop_prob': [0, 0.2],
    'num_epochs': [3000]
}

gen_params_set = 1
for key, val in hyperparams_grid.items():
    gen_params_set = gen_params_set * len(val)
import math
import re
import tarfile
from typing import NamedTuple

import numpy as np
import torch
from torch.nn import Parameter

from allennlp.common.checks import ConfigurationError
from allennlp.common.file_utils import cached_path
from allennlp.common.from_params import FromParams

import sys
sys.path.append('../')
from tsalib import dim_vars, warp

B, T, D, H = dim_vars('Batch SeqLength EmbedDim(d):768 NumHeads(h):12')


def gelu(x: torch.Tensor) -> torch.Tensor:
    return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))


def swish(x: torch.Tensor) -> torch.Tensor:
    return x * torch.sigmoid(x)


_ACTIVATION_FUNCTIONS = {'relu': torch.nn.ReLU, 'swish': swish, 'gelu': gelu}


class TransformerConfig(NamedTuple):
'''
EffNet: AN EFFICIENT STRUCTURE FOR CONVOLUTIONAL NEURAL NETWORKS
Implementation in PyTorch of EffNet.
https://arxiv.org/abs/1801.06434
'''
import torch
import torch.nn as nn

from tsanley.dynamic import init_analyzer
from tsalib import dim_vars

B, C = dim_vars('Batch(b):20 Channels(c):3')
H, W = dim_vars('Height(h):32 Width(w):32')
N = dim_vars('Nclass(n):10')


class Flatten(nn.Module):
    def forward(self, x):
        x = x.view(x.size()[0], -1)
        return x


class EffNet(nn.Module):
    def __init__(self, nb_classes=10, include_top=True, weights=None):
        super(EffNet, self).__init__()
        self.block1 = self.make_layers(32, 64)
        self.block2 = self.make_layers(64, 128)
        self.block3 = self.make_layers(128, 256)
        self.flatten = Flatten()
        self.linear = nn.Linear(4096, nb_classes)
import math

import numpy as np
import torch
import torch.nn as nn

from morpho_dataset import MorphoDataset
from tsalib import dim_vars

B, T, D, H, E = dim_vars('Batch SeqLength Dim NumHeads PositionalEmbedding')
T_K, T_Q = dim_vars('SeqLengthKey SeqLengthQuery')
S, W, C = dim_vars('Sentences Words Chars')

last_char_indices = {'len_k': 0, 'len_q': 0, 'indices': None}


def arrange_char_pos_embedding(len_k: int, len_q: int, max_len: int,
                               embedding: (E, D // H),
                               cache: bool = True) -> (T_Q, T_K, D // H):
    if cache and last_char_indices['len_k'] == len_k and last_char_indices['len_q'] == len_q:
        indices: (T_Q, T_K) = last_char_indices['indices']
    else:
        k: (T_K) = torch.arange(len_k, device='cuda')
        q: (T_Q) = torch.arange(len_q, device='cuda')
        indices: (T_Q, T_K) = k.view(1, -1) - q.view(-1, 1)
        indices.clamp_(-max_len, max_len).add_(max_len)
        last_char_indices['len_k'] = len_k
        last_char_indices['len_q'] = len_q
        last_char_indices['indices'] = indices
from tsalib import dim_vars

dim_vars("Batch(B) Channel(C) Height(H) Width(W)", exists_ok=True)
import sys
sys.path.append('../')

#from typing import List, Sequence, TypeVar
from tsalib import dim_var, dim_vars

# global declaration of dimension vars
#B, D, V, Dh, T, Te, Td, C, Ci, Co = declare_common_dim_vars()
B, C, D, H, W = dim_vars('Batch(b):48 Channels(c):3 EmbedDim(d):300 Height(h) Width(w)')


def test_decls():
    print('\n Test declarations ..')
    # local declarations
    print(f'B, C, D = {B}, {C}, {D}')

    H, W = dim_vars('Height(h):256 Width(w):256', check=False)
    print(f'H, W = {H}, {W}')


def test_arith():
    print('\n Test arithmetic ..')
    # Supports arithmetic over a combination of dim vars and other Python variables
    K = W * 2
    h = 4
    print((h, H // h, K, B * 2))
import torch
import torch.nn as nn
import torch.utils.model_zoo as model_zoo

# Original file: https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py
# Updated to add shape annotations (BasicBlock and ResNet modules)

import sys
sys.path.append('../')
from tsalib import dim_vars, declare_common_dim_vars

B, D, V, Dh, T, Te, Td, C, Ci, Co = declare_common_dim_vars()
H, W, C, Ex = dim_vars('Height Width Channels BlockExpansion')

__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152']

model_urls = {
    'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
    'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
    'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
    'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
    'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}


def conv3x3(in_planes, out_planes, stride=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(in_planes, out_planes,
import sys
sys.path.append('../')

#from typing import List, Sequence, TypeVar
from tsalib import dim_var, dim_vars, declare_common_dim_vars

# definitions in tsalib/ts.py
B, D, V, Dh, T, Te, Td, C, Ci, Co = declare_common_dim_vars()
H, W = dim_vars('Height:256 Width:256')
print(H, W)


def test_numpy():
    import numpy as np

    a: (B, D) = np.array([[1., 2., 3.], [10., 9., 8.]])
    print(f'original array: {(B,D)}: {a.shape}')

    b: (2, B, D) = np.stack([a, a])
    print(f'after stack: {(2,B,D)}: {b.shape}')

    ax = (2, B, D).index(B)
    c: (2, D) = np.mean(b, axis=ax)
    print(f'after mean along axis {B}={ax}: {(2,D)}: {c.shape}')

    # Supports arithmetic over a combination of dim vars and other Python variables
    K = W * 2
    var1 = 10
    print((..., 4, H // 4, K, B * 2, var1))
import sys
sys.path.append('../')
import numpy as np

from tsalib import dim_vars
from tsalib import view_transform, permute_transform, expand_transform

if __name__ == '__main__':
    B, T, D, K = dim_vars('Batch SeqLength EmbeddingDim K')
    H = 4

    x: (20, 10, 100) = np.ones((20, 10, 100))
    print(f'For x ({x.shape}):\n Transforming view {(B,T,D)} to {(B,T,H,D//H)}')
    new_shape = view_transform(src=(B, T, D), to=(B, T, H, D // H), in_shape=x.shape)
    x: (20, 10, 4, 25) = x.reshape(new_shape)
    print(f'After transform, x : {x.shape}\n')

    print(f'Permuting from {(B,T,D,K)} to {(D,T,B,K)}')
    perm_indices = permute_transform(src=(B, T, D, K), to=(D, T, B, K))
    x = x.transpose(perm_indices)
    print('permutation order:', perm_indices)
    print(f'After transform, x : {x.shape}\n')

    x: (B, T, D) = np.ones((20, 10, 100))
    x: (B, K, T, D) = x[:, None]
    print(f'Expanding {(B,K,T,D)} by {(K, K*5)}')
    expand_shape = expand_transform(src=(B, K, T, D), expansions=[(K, K * 5)],
def __init__(self,
             vocab_size: 'v',
             hidden_size: 'd' = 768,
             num_hidden_layers: 'l' = 12,
             num_attention_heads: 'n' = 4,
             intermediate_size: 's' = 3072,
             hidden_act="gelu",
             hidden_dropout_prob=0.1,
             attention_probs_dropout_prob=0.1,
             max_position_embeddings: 'p' = 512,
             type_vocab_size: 'vt' = 16,
             initializer_range=0.02):
    """Constructs BertConfig.

    Args:
        vocab_size: Vocabulary size of `inputs_ids` in `BertModel`.
        hidden_size: Size of the encoder layers and the pooler layer.
        num_hidden_layers: Number of hidden layers in the Transformer encoder.
        num_attention_heads: Number of attention heads for each attention layer in the
            Transformer encoder.
        intermediate_size: The size of the "intermediate" (i.e., feed-forward) layer in the
            Transformer encoder.
        hidden_act: The non-linear activation function (function or string) in the encoder
            and pooler.
        hidden_dropout_prob: The dropout probability for all fully connected layers in the
            embeddings, encoder, and pooler.
        attention_probs_dropout_prob: The dropout ratio for the attention probabilities.
        max_position_embeddings: The maximum sequence length that this model might ever be
            used with. Typically set this to something large just in case (e.g., 512 or 1024
            or 2048).
        type_vocab_size: The vocabulary size of the `token_type_ids` passed into `BertModel`.
        initializer_range: The stdev of the truncated_normal_initializer for initializing all
            weight matrices.
    """
    batch_size, seq_length = 13, 7
    B, T = dim_vars(f'batch_size(b):{batch_size} seq_length(t):{seq_length}', exists_ok=True)
    V, D = dim_vars(f'vocab_size(v):{vocab_size} hidden_size(d):{hidden_size}', exists_ok=True)
    Nl, N = dim_vars(f'num_hidden_layers(l):{num_hidden_layers} num_attention_heads(n):{num_attention_heads}',
                     exists_ok=True)
    IS, P, Vt = dim_vars(
        f'intermediate_size(s):{intermediate_size} max_position_embeddings(p):{max_position_embeddings} '
        f'type_vocab_size(vt):{type_vocab_size}', exists_ok=True)
    H = dim_vars(f'size_per_head(h):{hidden_size // num_attention_heads}', exists_ok=True)
    #print(f'bert config: D = {D}')

    self.vocab_size: V = vocab_size
    self.hidden_size: D = hidden_size
    self.num_hidden_layers: Nl = num_hidden_layers
    self.num_attention_heads: N = num_attention_heads
    self.hidden_act = hidden_act
    self.intermediate_size: IS = intermediate_size
    self.hidden_dropout_prob = hidden_dropout_prob
    self.attention_probs_dropout_prob = attention_probs_dropout_prob
    self.max_position_embeddings: P = max_position_embeddings
    self.type_vocab_size: Vt = type_vocab_size
    self.initializer_range = initializer_range