def __init__(
    self,
    n_heads,
    d_model,
    dropout_rate=0.0,
    skip_term_b=False,
    share_qvk_proj=False,
):
    super(MultiHeadedSelfAttentionWithRelPos, self).__init__(
        n_heads, d_model, dropout_rate, share_qvk_proj
    )
    self.d_model = d_model
    self.share_qvk_proj = share_qvk_proj
    self.skip_term_b = skip_term_b
    self.nheads = n_heads
    self.d_k = d_model // n_heads
    self.qvk_proj = nn.Linear(
        d_model, d_model if self.share_qvk_proj else d_model * 3
    )
    self.pos_proj = nn.Linear(d_model, d_model, bias=False)
    # Global content/position bias vectors (u and v in Transformer-XL);
    # flow.Tensor only allocates, so these are uninitialized at this point.
    self.posu = nn.Parameter(flow.Tensor(1, 1, n_heads, self.d_k))
    self.posv = nn.Parameter(flow.Tensor(1, 1, n_heads, self.d_k))
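# A minimal initialization sketch (an assumption, not part of the original
# snippet; `flow`/`nn` are oneflow and oneflow.nn, as elsewhere in this file).
# posu/posv above are allocated with flow.Tensor but never filled, so the
# surrounding code is expected to initialize them, e.g. with Xavier uniform:
def _init_rel_pos_bias(posu, posv):
    # hypothetical helper: fill the Transformer-XL style u/v bias terms in place
    nn.init.xavier_uniform_(posu)
    nn.init.xavier_uniform_(posv)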
def __init__(self, features, eps=1e-6):
    super(LayerNorm, self).__init__()
    self.eps = eps
    self.weight = nn.Parameter(flow.ones(features, dtype=flow.float32))
    self.bias = nn.Parameter(flow.zeros(features, dtype=flow.float32))
def __init__(self, input_sz, hidden_sz):
    super().__init__()
    self.input_sz = input_sz
    self.hidden_size = hidden_sz
    # Fused gate weights: input->gates and hidden->gates for 4 gates (i, f, g, o).
    self.W = nn.Parameter(flow.Tensor(input_sz, hidden_sz * 4))
    self.U = nn.Parameter(flow.Tensor(hidden_sz, hidden_sz * 4))
    self.bias = nn.Parameter(flow.Tensor(hidden_sz * 4))
    self.init_weights()
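# A hedged sketch of the forward step this fused layout implies (my assumption:
# the 4*hidden_sz columns are ordered i, f, g, o; the real init_weights and
# forward of the original class may differ).
def lstm_cell_step(self, x_t, h_t, c_t):
    # one time step: gates = x W + h U + b, split into the four gates
    gates = flow.matmul(x_t, self.W) + flow.matmul(h_t, self.U) + self.bias
    i, f, g, o = gates.chunk(4, dim=1)
    i, f, o = flow.sigmoid(i), flow.sigmoid(f), flow.sigmoid(o)
    g = flow.tanh(g)
    c_next = f * c_t + i * g
    h_next = o * flow.tanh(c_next)
    return h_next, c_next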
def __init__(self, dim, eps=1e-05, elementwise_affine=True):
    super(GlobalChannelLayerNorm, self).__init__()
    self.eps = eps
    self.normalized_dim = dim
    self.elementwise_affine = elementwise_affine
    if elementwise_affine:
        self.beta = nn.Parameter(flow.zeros(dim, 1))
        self.gamma = nn.Parameter(flow.ones(dim, 1))
    else:
        # register under the same names used in the affine branch so that
        # attribute access stays consistent either way
        self.register_parameter("gamma", None)
        self.register_parameter("beta", None)
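# A hedged sketch of global channel layer norm (gLN as in Conv-TasNet): the
# statistics are taken jointly over the channel and time axes of an (N, C, T)
# input. This forward is my reconstruction, not code from the original snippet.
def gln_forward(self, x):
    # x: (N, C, T); normalize over channel and time together
    mean = x.mean(dim=(1, 2), keepdim=True)
    var = ((x - mean) ** 2).mean(dim=(1, 2), keepdim=True)
    x_hat = (x - mean) / flow.sqrt(var + self.eps)
    if self.elementwise_affine:
        x_hat = self.gamma * x_hat + self.beta
    return x_hat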
def __init__(self, input_size, hidden_size):
    super().__init__()
    low, upper = -sqrt(1 / hidden_size), sqrt(1 / hidden_size)
    self.input_size = input_size
    self.hidden_size = hidden_size
    # Fused gate weights for the 3 GRU gates (reset, update, new), with
    # separate input and hidden biases.
    self.inp_W = nn.Parameter(flow.Tensor(input_size, hidden_size * 3))
    self.hid_W = nn.Parameter(flow.Tensor(hidden_size, hidden_size * 3))
    self.inp_b = nn.Parameter(flow.Tensor(hidden_size * 3))
    self.hid_b = nn.Parameter(flow.Tensor(hidden_size * 3))
    self.init_weight(low, upper)
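# A hedged sketch of the GRU step this 3*hidden layout implies (assumed gate
# order r, z, n, with the separate inp_b/hid_b biases applied as in the
# standard GRU formulation; the real init_weight/forward may differ).
def gru_cell_step(self, x_t, h_t):
    gi = flow.matmul(x_t, self.inp_W) + self.inp_b
    gh = flow.matmul(h_t, self.hid_W) + self.hid_b
    i_r, i_z, i_n = gi.chunk(3, dim=1)
    h_r, h_z, h_n = gh.chunk(3, dim=1)
    r = flow.sigmoid(i_r + h_r)   # reset gate
    z = flow.sigmoid(i_z + h_z)   # update gate
    n = flow.tanh(i_n + r * h_n)  # candidate state
    return (1 - z) * n + z * h_t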
def _test_convtranspose1d_bias_true(test_case, device):
    np_arr = np.array([[[0.54925832, -0.64144184, 0.15213189]]])
    weight = np.ones((1, 2, 3))
    bias = np.array([0.16849578, 0.1509564])
    test_out_data = np.array(
        [
            [
                [0.71775407, 0.07631224, 0.22844413, -0.32081416, 0.32062766],
                [0.7002147, 0.05877288, 0.21090476, -0.3383535, 0.3030883],
            ]
        ]
    )
    test_out_grad = np.array([[[6.0, 6.0, 6.0]]])
    input_flow = flow.tensor(
        np_arr, dtype=flow.float32, device=flow.device(device), requires_grad=True
    )
    m_f = nn.ConvTranspose1d(1, 2, 3, stride=1, bias=True)
    m_f.weight.data = flow.tensor(weight, dtype=flow.float32)
    m_f.bias = nn.Parameter(flow.Tensor(bias))
    m_f = m_f.to(device)
    out_flow = m_f(input_flow)
    test_case.assertTrue(np.allclose(out_flow.numpy(), test_out_data, 1e-06, 1e-06))
    out_flow = out_flow.sum()
    out_flow.backward()
    test_case.assertTrue(
        np.allclose(input_flow.grad.numpy(), test_out_grad, 1e-06, 1e-06)
    )
def __init__(self, num_patches, emb_dim, dropout_rate=0.1):
    super(PositionEmbs, self).__init__()
    # +1 position for the class token prepended to the patch sequence
    self.pos_embedding = nn.Parameter(
        flow.tensor(np.random.randn(1, num_patches + 1, emb_dim), dtype=flow.float32)
    )
    if dropout_rate > 0:
        self.dropout = nn.Dropout(dropout_rate)
    else:
        self.dropout = None
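# A hedged forward sketch (an assumption, not from the original snippet): the
# learned table is simply added to the token sequence, then dropout if set.
def pos_embs_forward(self, x):
    # x: (batch, num_patches + 1, emb_dim), class token already prepended
    out = x + self.pos_embedding
    if self.dropout is not None:
        out = self.dropout(out)
    return out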
def __init__(self, hidden_size, vocab_size, hidden_act=nn.GELU()):
    super().__init__()
    self.hidden_size = hidden_size
    self.transform = BertPredictionHeadTransform(hidden_size, hidden_act)
    # The output weights are the same as the input embeddings, but there is
    # an output-only bias for each token.
    self.decoder = nn.Linear(hidden_size, vocab_size, bias=False)
    self.output_bias = nn.Parameter(flow.zeros(vocab_size))
    # Need a link between the two variables so that the bias is correctly
    # resized with `resize_token_embeddings`.
    self.decoder.bias = self.output_bias
def __init__(
    self, model, input_size, output_size, num_experts, noisy_gating=True, k=4
):
    super(MoE, self).__init__()
    self.noisy_gating = noisy_gating
    self.num_experts = num_experts
    self.output_size = output_size
    self.input_size = input_size
    self.k = k
    # instantiate experts; deep-copy (requires `import copy`) so each expert
    # has its own weights -- repeating the same `model` instance would make
    # every entry of the ModuleList alias one shared module
    self.experts = nn.ModuleList(
        [copy.deepcopy(model) for _ in range(self.num_experts)]
    )
    self.w_gate = nn.Parameter(
        flow.zeros(input_size, num_experts), requires_grad=True
    )
    self.w_noise = nn.Parameter(
        flow.zeros(input_size, num_experts), requires_grad=True
    )
    self.softplus = nn.Softplus()
    self.softmax = nn.Softmax(1)
    assert self.k <= self.num_experts
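# A hedged sketch of noisy top-k gating over w_gate/w_noise (after Shazeer et
# al., 2017). This is my reconstruction under an assumed x: (batch, input_size);
# the original class's gating code, noise floor, and scatter details may differ.
def noisy_top_k_gating(self, x, train=True):
    logits = flow.matmul(x, self.w_gate)  # (batch, num_experts)
    if self.noisy_gating and train:
        noise_stddev = self.softplus(flow.matmul(x, self.w_noise)) + 1e-2
        noise = flow.randn(logits.shape, device=logits.device)
        logits = logits + noise * noise_stddev
    top_vals, top_idx = logits.topk(self.k, dim=1)
    top_gates = self.softmax(top_vals)  # normalize over the k kept experts
    # scatter the k gate values back into a dense (batch, num_experts) matrix
    gates = flow.zeros_like(logits).scatter(1, top_idx, top_gates)
    return gates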
def __init__(self, emb_dim, scale_learnable=False, dropout=0.0):
    """Initialize class.

    :param int emb_dim: embedding dim
    :param bool scale_learnable: whether the positional scale is learnable
    :param float dropout: dropout rate
    """
    super(PositionalEncoding, self).__init__()
    self.emb_dim = emb_dim
    self.xscale = math.sqrt(self.emb_dim)
    self.dropout = nn.Dropout(p=dropout)
    self.scale_learnable = scale_learnable
    if self.scale_learnable:
        self.alpha = nn.Parameter(flow.tensor(1.0))
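# A hedged forward sketch (my assumption: standard sinusoidal encodings built
# on the fly; the original class may cache a precomputed table and combine the
# terms differently). The learnable alpha, when present, scales the positional
# term as in "scaled" positional-encoding variants.
def positional_encoding_forward(self, x):
    # x: (batch, seq_len, emb_dim)
    seq_len = x.shape[1]
    pos = flow.arange(seq_len, dtype=flow.float32, device=x.device).unsqueeze(1)
    div = flow.exp(
        flow.arange(0, self.emb_dim, 2, dtype=flow.float32, device=x.device)
        * (-math.log(10000.0) / self.emb_dim)
    )
    pe = flow.zeros(seq_len, self.emb_dim, device=x.device)
    pe[:, 0::2] = flow.sin(pos * div)
    pe[:, 1::2] = flow.cos(pos * div)
    scale = self.alpha if self.scale_learnable else 1.0
    return self.dropout(x * self.xscale + scale * pe.unsqueeze(0))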
def __init__(
    self,
    dim,
    window_size,
    num_heads,
    qkv_bias=True,
    qk_scale=None,
    attn_drop=0.0,
    proj_drop=0.0,
):
    super().__init__()
    self.dim = dim
    self.window_size = window_size  # Wh, Ww
    self.num_heads = num_heads
    head_dim = dim // num_heads
    self.scale = qk_scale or head_dim**-0.5

    # define a parameter table of relative position bias
    # Author zzk: we add trunc normal here!
    self.relative_position_bias_table = nn.Parameter(
        flow.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)
    )  # 2*Wh-1 * 2*Ww-1, nH
    self.relative_position_bias_table.trunc_normal_(std=0.02)

    # get pair-wise relative position index for each token inside the window
    coords_h = flow.arange(self.window_size[0])
    coords_w = flow.arange(self.window_size[1])
    coords = flow.stack(flow.meshgrid(coords_h, coords_w))  # 2, Wh, Ww
    coords_flatten = flow.flatten(coords, 1)  # 2, Wh*Ww
    relative_coords = (
        coords_flatten[:, :, None] - coords_flatten[:, None, :]
    )  # 2, Wh*Ww, Wh*Ww
    relative_coords = relative_coords.permute(1, 2, 0)  # Wh*Ww, Wh*Ww, 2
    relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
    relative_coords[:, :, 1] += self.window_size[1] - 1
    relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
    relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
    self.register_buffer("relative_position_index", relative_position_index)

    self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
    self.attn_drop = nn.Dropout(attn_drop)
    self.proj = nn.Linear(dim, dim)
    self.proj_drop = nn.Dropout(proj_drop)
    self.softmax = nn.Softmax(dim=-1)
def _test_convtranspose1d_group_bias_true(test_case, device):
    np_arr = np.array(
        [
            [
                [-0.77808793, 0.99824008, 0.57340066],
                [1.46278707, -0.65234252, -1.13087643],
            ],
            [
                [0.76053973, 0.62332447, -1.17157106],
                [0.60291466, -0.0472167, 0.89986403],
            ],
        ]
    )
    weight = np.ones((2, 1, 3))
    bias = np.array([0.32546719, 0.14995032])
    test_out_data = np.array(
        [
            [
                [-0.45262071, 0.54561937, 1.11902, 1.897108, 0.89886785],
                [1.6127374, 0.96039486, -0.1704815, -1.6332686, -0.9809261],
            ],
            [
                [1.0860069, 1.7093314, 0.5377604, -0.22277936, -0.8461038],
                [0.75286496, 0.70564824, 1.6055121, 1.0025976, 1.0498143],
            ],
        ]
    )
    test_out_grad = np.array(
        [[[3.0, 3.0, 3.0], [3.0, 3.0, 3.0]], [[3.0, 3.0, 3.0], [3.0, 3.0, 3.0]]]
    )
    input_flow = flow.tensor(
        np_arr, dtype=flow.float32, device=flow.device(device), requires_grad=True
    )
    m_f = nn.ConvTranspose1d(2, 2, 3, stride=1, groups=2, bias=True)
    m_f.weight.data = flow.tensor(weight, dtype=flow.float32)
    m_f.bias = nn.Parameter(flow.Tensor(bias))
    m_f = m_f.to(device)
    out_flow = m_f(input_flow)
    test_case.assertTrue(np.allclose(out_flow.numpy(), test_out_data, 1e-06, 1e-06))
    out_flow = out_flow.sum()
    out_flow.backward()
    test_case.assertTrue(
        np.allclose(input_flow.grad.numpy(), test_out_grad, 1e-06, 1e-06)
    )
def __init__(
    self,
    image_size=(256, 256),
    patch_size=(16, 16),
    emb_dim=768,
    mlp_dim=3072,
    num_heads=12,
    num_layers=12,
    num_classes=1000,
    attn_dropout_rate=0.0,
    dropout_rate=0.1,
    feat_dim=None,
):
    super(VisionTransformer, self).__init__()
    h, w = image_size

    # embedding layer
    fh, fw = patch_size
    gh, gw = h // fh, w // fw
    num_patches = gh * gw
    self.embedding = nn.Conv2d(3, emb_dim, kernel_size=(fh, fw), stride=(fh, fw))

    # class token
    self.cls_token = nn.Parameter(flow.zeros(1, 1, emb_dim))

    # transformer
    self.transformer = Encoder(
        num_patches=num_patches,
        emb_dim=emb_dim,
        mlp_dim=mlp_dim,
        num_layers=num_layers,
        num_heads=num_heads,
        dropout_rate=dropout_rate,
        attn_dropout_rate=attn_dropout_rate,
    )

    # classifier
    self.classifier = nn.Linear(emb_dim, num_classes)
def __init__(self, features, eps=1e-6):
    super(LayerNorm, self).__init__()
    self.gamma = nn.Parameter(flow.ones(features))
    self.beta = nn.Parameter(flow.zeros(features))
    self.eps = eps
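# A hedged forward sketch of the normalization these gamma/beta parameters
# imply (my assumption: normalize over the last dimension, in the style of the
# "Annotated Transformer" LayerNorm; the original forward may differ).
def layer_norm_forward(self, x):
    mean = x.mean(-1, keepdim=True)
    std = x.std(-1, keepdim=True)
    return self.gamma * (x - mean) / (std + self.eps) + self.beta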
def __init__(self, nf, nx):
    super(Conv1D, self).__init__()
    self.nf = nf
    # GPT-2 style Conv1D: effectively a linear layer with a transposed
    # (nx, nf) weight, applied over the last axis of the input.
    self.weight = nn.Parameter(flow.Tensor(nx, nf))
    nn.init.normal_(self.weight, mean=0, std=0.02)
    self.bias = nn.Parameter(flow.zeros(nf))
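# A hedged forward sketch (assumed to mirror the usual GPT-2 Conv1D forward:
# flatten leading dims, affine map, reshape back; not taken from the original
# snippet).
def conv1d_forward(self, x):
    size_out = x.shape[:-1] + (self.nf,)
    x = flow.addmm(self.bias, x.view(-1, x.shape[-1]), self.weight)
    return x.view(*size_out)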
def __init__(self, hidden_size, eps=1e-6):
    super(LayerNorm, self).__init__()
    self.eps = eps
    self.weight = nn.Parameter(flow.ones(hidden_size, dtype=flow.float32))
    self.bias = nn.Parameter(flow.zeros(hidden_size, dtype=flow.float32))
def _load_of_weight(param: flow.nn.Parameter, data: np.ndarray):
    assert param.shape == data.shape
    # copy_ takes a tensor; wrapping the source in nn.Parameter is unnecessary
    param.copy_(flow.tensor(data, dtype=flow.float32))
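# A hedged usage example (the Linear layer and numpy array here are
# hypothetical): copy pretrained numpy weights into an existing OneFlow
# parameter in place, without tracking the copy in autograd.
linear = nn.Linear(4, 8)
pretrained_w = np.random.randn(8, 4).astype(np.float32)
with flow.no_grad():
    _load_of_weight(linear.weight, pretrained_w)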
def __init__(
    self,
    out_channels,
    kernel_size,
    sample_rate=16000,
    in_channels=1,
    stride=1,
    padding=0,
    dilation=1,
    bias=False,
    groups=1,
    min_low_hz=50,
    min_band_hz=50,
):
    super(SincConv_fast, self).__init__()

    if in_channels != 1:
        msg = (
            "SincConv only supports one input channel (here, in_channels = %i)"
            % in_channels
        )
        raise ValueError(msg)

    self.out_channels = out_channels
    self.kernel_size = kernel_size
    # Forcing the filters to be odd (i.e., perfectly symmetric)
    if kernel_size % 2 == 0:
        self.kernel_size = self.kernel_size + 1

    self.stride = stride
    self.padding = padding
    self.dilation = dilation

    if bias:
        raise ValueError("SincConv does not support bias.")
    if groups > 1:
        raise ValueError("SincConv does not support groups.")

    self.sample_rate = sample_rate
    self.min_low_hz = min_low_hz
    self.min_band_hz = min_band_hz

    # initialize filterbanks such that they are equally spaced in Mel scale
    low_hz = 30
    high_hz = self.sample_rate / 2 - (self.min_low_hz + self.min_band_hz)
    mel = np.linspace(
        self.to_mel(low_hz), self.to_mel(high_hz), self.out_channels + 1
    )
    hz = self.to_hz(mel)

    # filter lower frequency (out_channels, 1)
    self.low_hz_ = nn.Parameter(flow.Tensor(hz[:-1]).reshape(-1, 1))
    # filter frequency band (out_channels, 1)
    self.band_hz_ = nn.Parameter(flow.Tensor(np.diff(hz)).reshape(-1, 1))

    # Hamming window, computed on half of the (symmetric) kernel
    n_lin = flow.Tensor(
        np.linspace(0, (self.kernel_size / 2) - 1, int(self.kernel_size / 2))
    )
    self.window_ = 0.54 - 0.46 * flow.cos(
        2 * math.pi * n_lin / self.kernel_size
    )  # (1, kernel_size/2)

    n = (self.kernel_size - 1) / 2.0
    self.n_ = (
        2 * math.pi * flow.Tensor(np.arange(-n, 0).reshape(1, -1) / self.sample_rate)
    )
def __init__(self, options):
    super(MLP, self).__init__()
    self.input_dim = int(options["input_dim"])
    self.fc_lay = options["fc_lay"]
    self.fc_drop = options["fc_drop"]
    self.fc_use_batchnorm = options["fc_use_batchnorm"]
    self.fc_use_laynorm = options["fc_use_laynorm"]
    self.fc_use_laynorm_inp = options["fc_use_laynorm_inp"]
    self.fc_use_batchnorm_inp = options["fc_use_batchnorm_inp"]
    self.fc_act = options["fc_act"]

    self.wx = nn.ModuleList([])
    self.bn = nn.ModuleList([])
    self.ln = nn.ModuleList([])
    self.act = nn.ModuleList([])
    self.drop = nn.ModuleList([])

    # input layer normalization
    if self.fc_use_laynorm_inp:
        self.ln0 = LayerNorm(self.input_dim)

    # input batch normalization (BatchNorm1d expects an int num_features)
    if self.fc_use_batchnorm_inp:
        self.bn0 = nn.BatchNorm1d(self.input_dim, momentum=0.05)

    self.N_fc_lay = len(self.fc_lay)
    current_input = self.input_dim

    # Initialization of hidden layers
    for i in range(self.N_fc_lay):
        # dropout
        self.drop.append(nn.Dropout(p=self.fc_drop[i]))

        # activation
        self.act.append(act_fun(self.fc_act[i]))

        add_bias = True

        # layer norm initialization
        self.ln.append(LayerNorm(self.fc_lay[i]))
        self.bn.append(nn.BatchNorm1d(self.fc_lay[i], momentum=0.05))

        if self.fc_use_laynorm[i] or self.fc_use_batchnorm[i]:
            add_bias = False

        # Linear operations
        self.wx.append(nn.Linear(current_input, self.fc_lay[i], bias=add_bias))

        # weight initialization
        self.wx[i].weight = nn.Parameter(
            flow.Tensor(self.fc_lay[i], current_input).uniform_(
                -np.sqrt(0.01 / (current_input + self.fc_lay[i])),
                np.sqrt(0.01 / (current_input + self.fc_lay[i])),
            )
        )
        self.wx[i].bias = nn.Parameter(flow.zeros(self.fc_lay[i]))

        current_input = self.fc_lay[i]
def __init__(
    self,
    img_size=224,
    patch_size=4,
    in_chans=3,
    num_classes=1000,
    embed_dim=96,
    depths=[2, 2, 6, 2],
    num_heads=[3, 6, 12, 24],
    window_size=7,
    mlp_ratio=4.0,
    qkv_bias=True,
    qk_scale=None,
    drop_rate=0.0,
    attn_drop_rate=0.0,
    drop_path_rate=0.1,
    norm_layer=nn.LayerNorm,
    ape=False,
    patch_norm=True,
    use_checkpoint=False,
    **kwargs,
):
    super().__init__()

    self.num_classes = num_classes
    self.num_layers = len(depths)
    self.embed_dim = embed_dim
    self.ape = ape
    self.patch_norm = patch_norm
    self.num_features = int(embed_dim * 2 ** (self.num_layers - 1))
    self.mlp_ratio = mlp_ratio

    # split image into non-overlapping patches
    self.patch_embed = PatchEmbed(
        img_size=img_size,
        patch_size=patch_size,
        in_chans=in_chans,
        embed_dim=embed_dim,
        norm_layer=norm_layer if self.patch_norm else None,
    )
    num_patches = self.patch_embed.num_patches
    patches_resolution = self.patch_embed.patches_resolution
    self.patches_resolution = patches_resolution

    # absolute position embedding
    if self.ape:
        self.absolute_pos_embed = nn.Parameter(flow.zeros(1, num_patches, embed_dim))
        self.absolute_pos_embed.trunc_normal_(std=0.02)

    self.pos_drop = nn.Dropout(p=drop_rate)

    # stochastic depth decay rule
    # TODO: here we use numpy instead of flow.linspace, which may give a
    # small numerical difference from torch.linspace
    dpr = [x for x in np.linspace(0, drop_path_rate, sum(depths))]

    # build layers
    self.layers = nn.ModuleList()
    for i_layer in range(self.num_layers):
        layer = BasicLayer(
            dim=int(embed_dim * 2**i_layer),
            input_resolution=(
                patches_resolution[0] // (2**i_layer),
                patches_resolution[1] // (2**i_layer),
            ),
            depth=depths[i_layer],
            num_heads=num_heads[i_layer],
            window_size=window_size,
            mlp_ratio=self.mlp_ratio,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            drop=drop_rate,
            attn_drop=attn_drop_rate,
            drop_path=dpr[sum(depths[:i_layer]) : sum(depths[: i_layer + 1])],
            norm_layer=norm_layer,
            downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
            use_checkpoint=use_checkpoint,
        )
        self.layers.append(layer)

    self.norm = norm_layer(self.num_features)
    self.avgpool = nn.AdaptiveAvgPool1d(1)
    self.head = (
        nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
    )

    self.apply(self._init_weights)
def __init__(self):
    super().__init__()
    # fixed (non-trainable) parameters: requires_grad is set to False
    self.x = nn.Parameter(flow.tensor([2, 2], dtype=flow.float32), requires_grad=False)
    self.y = nn.Parameter(flow.tensor([3, 3], dtype=flow.float32), requires_grad=False)