def __init__(self, n_ctx, config, scale=False):
    super(Block, self).__init__()
    nx = config.n_embd
    self.ln_1 = LayerNorm(nx, eps=config.layer_norm_epsilon)
    self.attn = Attention(nx, n_ctx, config, scale)
    self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon)
    self.mlp = MLP(4 * nx, config)
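# A minimal usage sketch, not from the source: one plausible forward pass
# for this pre-LayerNorm GPT-2 block (normalize, transform, add residual).
def forward(self, x):
    # Attention sub-layer with residual connection.
    x = x + self.attn(self.ln_1(x))
    # Position-wise MLP sub-layer with residual connection.
    x = x + self.mlp(self.ln_2(x))
    return x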
def __init__(self, hidden_size, num_attention_heads,
             attention_dropout_prob, output_dropout_prob,
             layernorm_epsilon, init_method,
             output_layer_init_method=None):
    super(GPT2ParallelTransformerLayer, self).__init__()
    # Set output layer initialization if not provided.
    if output_layer_init_method is None:
        output_layer_init_method = init_method
    # Layernorm on the input data.
    self.input_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)
    # Self attention.
    self.attention = GPT2ParallelSelfAttention(
        hidden_size, num_attention_heads, attention_dropout_prob,
        output_dropout_prob, init_method,
        output_layer_init_method=output_layer_init_method)
    # Layernorm after the self attention.
    self.post_attention_layernorm = LayerNorm(hidden_size,
                                              eps=layernorm_epsilon)
    # MLP.
    self.mlp = GPT2ParallelMLP(
        hidden_size, output_dropout_prob, init_method,
        output_layer_init_method=output_layer_init_method)
def __init__(
    self,
    n_heads,
    embedding_size,
    ffn_size,
    attention_dropout=0.0,
    relu_dropout=0.0,
    dropout=0.0,
    activation='relu',
    variant='aiayn',
):
    super().__init__()
    self.dim = embedding_size
    self.ffn_dim = ffn_size
    self.variant = variant
    self.activation = activation
    self.dropout = nn.Dropout(p=dropout)
    self.self_attention = MultiHeadAttention(n_heads, embedding_size,
                                             dropout=attention_dropout)
    self.norm1 = LayerNorm(embedding_size, eps=LAYER_NORM_EPS)
    self.encoder_attention = MultiHeadAttention(n_heads, embedding_size,
                                                dropout=attention_dropout)
    self.norm2 = LayerNorm(embedding_size, eps=LAYER_NORM_EPS)
    self.ffn = TransformerFFN(embedding_size, ffn_size,
                              relu_dropout=relu_dropout,
                              activation=activation)
    self.norm3 = LayerNorm(embedding_size, eps=LAYER_NORM_EPS)
def __init__(self, attention_mask_func, mlp_activation_func,
             init_method, output_layer_init_method, layer_number):
    args = get_args()
    super(ParallelTransformerLayer, self).__init__()
    self.layer_number = layer_number
    self.apply_residual_connection_post_layernorm \
        = args.apply_residual_connection_post_layernorm
    # Layernorm on the input data.
    self.input_layernorm = LayerNorm(args.hidden_size,
                                     eps=args.layernorm_epsilon)
    # Self attention.
    self.attention = ParallelSelfAttention(attention_mask_func, init_method,
                                           output_layer_init_method,
                                           layer_number)
    # Layernorm after the self attention.
    self.post_attention_layernorm = LayerNorm(args.hidden_size,
                                              eps=args.layernorm_epsilon)
    # MLP.
    self.mlp = ParallelMLP(mlp_activation_func, init_method,
                           output_layer_init_method)
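# A hedged sketch, not the source's forward: what
# apply_residual_connection_post_layernorm typically controls in
# Megatron-style layers, namely whether the residual branch starts from
# the raw input or from the LayerNorm output.
def forward(self, hidden_states, attention_mask):
    layernorm_output = self.input_layernorm(hidden_states)
    attention_output = self.attention(layernorm_output, attention_mask)
    # Residual source for the attention sub-layer.
    residual = (layernorm_output
                if self.apply_residual_connection_post_layernorm
                else hidden_states)
    layernorm_input = residual + attention_output
    layernorm_output = self.post_attention_layernorm(layernorm_input)
    mlp_output = self.mlp(layernorm_output)
    # Residual source for the MLP sub-layer.
    residual = (layernorm_output
                if self.apply_residual_connection_post_layernorm
                else layernorm_input)
    return residual + mlp_output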
def __init__(self, blocks, hidden_dim):
    super().__init__()
    self.wq = modules.Linear(hidden_dim, hidden_dim)
    self.wk = modules.Linear(hidden_dim, hidden_dim)
    self.wv = modules.Linear(hidden_dim, hidden_dim)
    self.lnq = LayerNorm(hidden_dim)
    self.lnk = LayerNorm(hidden_dim)
    self.lnv = LayerNorm(hidden_dim)
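# A hypothetical helper, assumed rather than taken from the source: one
# plausible way these per-projection LayerNorms are used, normalizing each
# input stream before projecting it to queries, keys and values. The
# reverse ordering (project, then normalize) is also common.
def project_qkv(self, query_in, key_in, value_in):
    q = self.wq(self.lnq(query_in))
    k = self.wk(self.lnk(key_in))
    v = self.wv(self.lnv(value_in))
    return q, k, v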
def __init__(self, embed_dim, hidden_dim, max_mem, nhead=1, dropp=0.1,
             residual=True, rnn=True, use_attn=True):
    super().__init__()
    self.attn = None
    if use_attn:
        self.attn = modules.EMultiHeadAttention(embed_dim, hidden_dim,
                                                nhead=nhead, dropp=dropp,
                                                residual=residual)
    # Default to None so the attribute always exists, mirroring self.attn.
    self.rnn = None
    if rnn:
        self.rnn = LSTM(embed_dim, embed_dim, batch_first=False)
    self.ff = modules.Boom(embed_dim, hidden_dim, dropp=dropp,
                           shortcut=True, residual=residual)
    self.max_mem = max_mem
    self.lnmid = LayerNorm(embed_dim, eps=1e-12)
    self.lnmem = LayerNorm(embed_dim, eps=1e-12)
    self.lnout = LayerNorm(embed_dim, eps=1e-12)
    self.lnff = LayerNorm(embed_dim, eps=1e-12)
    self.gelu = modules.GELU()
def __init__(self, hidden_size, num_attention_heads,
             attention_dropout_prob, output_dropout_prob,
             layernorm_epsilon, init_method,
             output_layer_init_method=None, num_experts=1):
    super(GPT2ParallelTransformerLayer, self).__init__()
    # Set output layer initialization if not provided.
    if output_layer_init_method is None:
        output_layer_init_method = init_method
    # Layernorm on the input data.
    self.input_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)
    # Self attention.
    self.attention = GPT2ParallelSelfAttention(
        hidden_size, num_attention_heads, attention_dropout_prob,
        output_dropout_prob, init_method,
        output_layer_init_method=output_layer_init_method)
    # Layernorm after the self attention.
    self.post_attention_layernorm = LayerNorm(hidden_size,
                                              eps=layernorm_epsilon)
    # MLP: a single dense expert, or a DeepSpeed MoE layer.
    if num_experts == 1:
        self.mlp = GPT2ParallelMLP(
            hidden_size, output_dropout_prob, init_method,
            output_layer_init_method=output_layer_init_method)
    else:
        from deepspeed.moe.layer import MoE
        # Use the DeepSpeed MoE API; sharding, communication and
        # parameter handling are done inside DeepSpeed.
        self.mlp = MoE(
            hidden_size,
            GPT2ParallelMLP(hidden_size, output_dropout_prob, init_method,
                            output_layer_init_method=output_layer_init_method),
            num_experts=num_experts)
def __init__(self, config, img_dim, loss="cls",
             margin=0.2, hard_ratio=0.3, mlp=1):
    super().__init__(config)
    self.uniter = UniterModel(config, img_dim)
    if mlp == 1:
        self.re_output = nn.Linear(config.hidden_size, 1)
    elif mlp == 2:
        self.re_output = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size),
            GELU(),
            LayerNorm(config.hidden_size, eps=1e-12),
            nn.Linear(config.hidden_size, 1))
    else:
        raise ValueError("MLP restricted to be 1 or 2 layers.")
    self.loss = loss
    assert self.loss in ['cls', 'rank']
    if self.loss == 'rank':
        self.margin = margin
        self.hard_ratio = hard_ratio
    else:
        self.crit = nn.CrossEntropyLoss(reduction='none')
    self.apply(self.init_weights)
def __init__(self, in_dim, out_dim, cache_len):
    super().__init__()
    self.cache_len = cache_len
    self.lnq = LayerNorm(in_dim)
    self.lnk = LayerNorm(in_dim)
    self.lnv = LayerNorm(in_dim)
    self.wq = nn.Linear(in_dim, out_dim)
    self.wk = nn.Linear(in_dim, out_dim)
    self.wv = nn.Linear(in_dim, out_dim)
    self.hyper_net = indicator(in_dim, cache_len, 5)
def __init__(self, nhid, q=True, k=False, v=False, r=False,
             heads=1, dropout=None):
    super().__init__()
    self.qs = nn.Parameter(torch.zeros(size=(1, 1, nhid), dtype=torch.float))
    self.ks = nn.Parameter(torch.zeros(size=(1, 1, nhid), dtype=torch.float))
    self.vs = nn.Parameter(torch.zeros(size=(1, 1, nhid), dtype=torch.float))
    self.qkvs = nn.Parameter(torch.zeros(size=(1, 3, nhid), dtype=torch.float))
    self.heads = heads
    self.nhid = nhid
    assert nhid % self.heads == 0, "Heads must divide vector evenly"
    self.drop = nn.Dropout(dropout) if dropout else None
    self.gelu = GELU()
    self.q = nn.Linear(nhid, nhid) if q else None
    self.qln = LayerNorm(nhid, eps=1e-12)
    self.k = nn.Linear(nhid, nhid) if k else None
    self.v = nn.Linear(nhid, nhid) if v else None
    self.r = nn.Linear(2 * nhid, nhid) if r else None
    self.r_gate = nn.Parameter(torch.ones(size=(1, 1, nhid), dtype=torch.float))
    self.vq = Overparam(nhid)
    self.vq_collapsed = False
def __init__(self, config, eps=1e-12):
    super().__init__()
    self.intermediate = nn.Linear(config.hidden_size, config.intermediate_size)
    self.output = nn.Linear(config.intermediate_size, config.hidden_size)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.layer_norm = LayerNorm(config.hidden_size, eps=eps)
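# A minimal sketch, not from the source: the usual post-LayerNorm
# feed-forward sub-layer these modules implement (F is assumed to be
# torch.nn.functional).
def forward(self, hidden_states):
    intermediate = F.gelu(self.intermediate(hidden_states))
    output = self.dropout(self.output(intermediate))
    # Residual connection followed by LayerNorm (post-LN ordering).
    return self.layer_norm(hidden_states + output)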
def __init__(self, model_config, template):
    super().__init__()
    self.model_name = 'mplstm'
    self.embed_dim = model_config.embed_dim
    self.hidden_dim = model_config.hidden_dim
    self.nlayer = model_config.nlayer
    print('Model [{}] has been built.'.format(self.model_name))
    self.cache_len = 200
    self.dropi = nn.Dropout(model_config.dropp)
    self.atts = nn.ModuleList()
    self.fnns = nn.ModuleList()
    self.rnns = nn.ModuleList()
    self.rnln = nn.ModuleList()
    for i in range(model_config.nlayer):
        self.atts.append(add_transformer(self.hidden_dim, self.hidden_dim,
                                         self.cache_len))
        self.fnns.append(Boom(self.hidden_dim))
        self.rnns.append(nn.LSTM(self.hidden_dim, self.hidden_dim,
                                 model_config.nlayer, batch_first=True))
        self.rnln.append(LayerNorm(self.hidden_dim))
def __init__(self, in_dim, out_dim, cache_len):
    super().__init__()
    self.cache_len = cache_len
    self.lnq = LayerNorm(in_dim)
    self.lnk = LayerNorm(in_dim)
    self.lnv = LayerNorm(in_dim)
    self.wq = nn.Linear(in_dim, out_dim)
    self.wk = nn.Linear(in_dim, out_dim)
    self.wv = nn.Linear(in_dim, out_dim)
    self.hyper_kernel = 5
    self.hyper_layer = 1
    self.hyper_net = indicator(in_dim, cache_len, self.hyper_kernel,
                               self.hyper_layer)
def __init__(self, hidden_size, feat_dim, img_linear_weight):
    super().__init__()
    self.net = nn.Sequential(nn.Linear(hidden_size, hidden_size),
                             GELU(),
                             LayerNorm(hidden_size, eps=1e-12))
    self.weight = img_linear_weight
    self.bias = nn.Parameter(torch.zeros(feat_dim))
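# A hedged sketch of the weight-tied forward pass, assuming F is
# torch.nn.functional: the transposed image-embedding weight decodes
# hidden states back into the image-feature space.
def forward(self, input_):
    hidden = self.net(input_)
    return F.linear(hidden, self.weight.t(), self.bias)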
def __init__(self, hidden_size, num_attention_heads,
             attention_dropout_prob, output_dropout_prob,
             layernorm_epsilon, init_method,
             output_layer_init_method=None):
    super(ParallelDecoderLayer, self).__init__()
    # Set output layer initialization if not provided.
    if output_layer_init_method is None:
        output_layer_init_method = init_method
    # Layernorm on the input data.
    self.input_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)
    # Self attention.
    self.self_attention = ParallelSelfAttention(
        hidden_size, num_attention_heads, attention_dropout_prob,
        output_dropout_prob, init_method,
        output_layer_init_method=output_layer_init_method)
    # Layernorm after the self attention.
    self.post_self_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)
    # Cross attention over the encoder output.
    self.cross_attention = ParallelCrossAttention(
        hidden_size, num_attention_heads, attention_dropout_prob,
        output_dropout_prob, init_method,
        output_layer_init_method=output_layer_init_method)
    # Layernorm after the cross attention.
    self.post_attention_layernorm = LayerNorm(hidden_size,
                                              eps=layernorm_epsilon)
    # MLP.
    self.mlp = ParallelMLP(
        hidden_size, output_dropout_prob, init_method,
        output_layer_init_method=output_layer_init_method)
def __init__(self, config, img_dim, num_answer):
    super().__init__(config)
    self.roberta = VLXLMRModel(config, img_dim)
    self.vqa_output = nn.Sequential(
        nn.Linear(config.hidden_size, config.hidden_size * 2),
        GELU(),
        LayerNorm(config.hidden_size * 2, eps=config.layer_norm_eps),
        nn.Linear(config.hidden_size * 2, num_answer))
    self.apply(self.init_weights)
def __init__(self, config, img_dim, num_answer):
    super().__init__(config)
    self.bert = UniterModel(config, img_dim)
    self.vqa_output = nn.Sequential(
        nn.Linear(config.hidden_size, config.hidden_size * 2),
        GELU(),
        LayerNorm(config.hidden_size * 2, eps=1e-12),
        nn.Linear(config.hidden_size * 2, num_answer))
    self.apply(self.init_weights)
def __init__(self, hidden_size, label_dim):
    super().__init__()
    self.net = nn.Sequential(
        nn.Linear(hidden_size, hidden_size),
        GELU(),
        LayerNorm(hidden_size, eps=1e-12),
        nn.Linear(hidden_size, label_dim),
    )
def __init__(self, config, img_dim):
    super().__init__(config, img_dim)
    self.uniter = UniterModel(config, img_dim)
    self.vcr_output = nn.Sequential(
        nn.Linear(config.hidden_size, config.hidden_size * 2),
        nn.ReLU(),
        LayerNorm(config.hidden_size * 2, eps=1e-12),
        nn.Linear(config.hidden_size * 2, 2))
    self.apply(self.init_weights)
def __init__(self, num_layers, hidden_size, num_attention_heads,
             attention_dropout_prob, output_dropout_prob,
             checkpoint_activations, checkpoint_num_layers=1,
             layernorm_epsilon=1.0e-5, init_method_std=0.02,
             use_scaled_init_for_output_weights=True,
             use_deepspeed_sparse=None, sparse_mode='all'):
    super(GPT3ParallelTransformer, self).__init__()
    if DEEPSPEED_WRAP:
        from deepspeed.ops.sparse_attention import SparseSelfAttention
    # Store activation checkpointing flags.
    self.checkpoint_activations = checkpoint_activations
    self.checkpoint_num_layers = checkpoint_num_layers

    output_layer_init_method = None
    if use_scaled_init_for_output_weights:
        output_layer_init_method = scaled_init_method(init_method_std,
                                                      num_layers)
    if use_deepspeed_sparse and sparse_mode == 'alternating':
        print('Use alternating sparse & dense attention layers')

    def get_layer(layer_num, num_layers):
        sparsity_config = use_deepspeed_sparse
        if use_deepspeed_sparse:
            if sparse_mode == 'alternating' and layer_num % 2:
                # Every second layer (odd 0-based index) is dense.
                sparsity_config = None
            elif sparse_mode == 'top_bottom' and layer_num >= num_layers // 2:
                # The top half of the stack is dense.
                sparsity_config = None
        return GPT3ParallelTransformerLayer(
            hidden_size, num_attention_heads, attention_dropout_prob,
            output_dropout_prob, layernorm_epsilon,
            unscaled_init_method(init_method_std),
            output_layer_init_method=output_layer_init_method,
            use_deepspeed_sparse=sparsity_config)

    # Transformer layers.
    self.layers = torch.nn.ModuleList(
        [get_layer(i, num_layers) for i in range(num_layers)])

    # Final layer norm before output.
    self.final_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)

    if DEEPSPEED_WRAP and DEEPSPEED_WRAP.deepspeed.checkpointing.is_configured():
        global get_cuda_rng_tracker, checkpoint
        get_cuda_rng_tracker = DEEPSPEED_WRAP.deepspeed.checkpointing.get_cuda_rng_tracker
        checkpoint = DEEPSPEED_WRAP.deepspeed.checkpointing.checkpoint
def __init__(self, n_hidden, n_layer, n_head=6, mm=True):
    super().__init__()
    self.n_head = n_head
    self.att = nn.ModuleList()
    for _ in range(n_layer):
        en = MultiHeadBlock(n_hidden, n_head=n_head)
        ln = LayerNorm(n_hidden)
        # ln = apex.normalization.FusedLayerNorm(n_hidden)
        self.att.append(nn.Sequential(en, ln))
def __init__(self, cfg, position_embeds=True, segment_embeds=True):
    super().__init__()
    # Token embedding.
    self.tok_embed = Embedding(cfg.vocab_size, cfg.dim)
    # Position embedding.
    self.pos_embed = Embedding(cfg.max_len, cfg.dim) \
        if position_embeds else None
    # Segment (token type) embedding.
    self.seg_embed = Embedding(cfg.n_segments, cfg.dim) \
        if segment_embeds else None
    self.norm = LayerNorm(cfg)
    self.drop = nn.Dropout(cfg.p_drop_hidden)
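# A minimal sketch, assumed rather than from the source: summing token,
# position and segment embeddings before LayerNorm and dropout, skipping
# whichever tables were disabled at construction time.
def forward(self, x, seg):
    e = self.tok_embed(x)
    if self.pos_embed is not None:
        # Positions 0..S-1, broadcast over the batch dimension.
        pos = torch.arange(x.size(1), device=x.device).unsqueeze(0)
        e = e + self.pos_embed(pos)
    if self.seg_embed is not None:
        e = e + self.seg_embed(seg)
    return self.drop(self.norm(e))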
def __init__(self,
             hidden_size: int,
             hidden_act: typing.Union[str, typing.Callable] = 'gelu',
             layer_norm_eps: float = 1e-12):
    super().__init__()
    self.dense = nn.Linear(hidden_size, hidden_size)
    if isinstance(hidden_act, str):
        self.transform_act_fn = get_activation_fn(hidden_act)
    else:
        self.transform_act_fn = hidden_act
    self.LayerNorm = LayerNorm(hidden_size, eps=layer_norm_eps)
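# A minimal sketch, not from the source: the standard prediction-head
# transform of dense projection, activation, then LayerNorm.
def forward(self, hidden_states):
    hidden_states = self.dense(hidden_states)
    hidden_states = self.transform_act_fn(hidden_states)
    return self.LayerNorm(hidden_states)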
def __init__(self, config, img_dim, num_answer):
    super().__init__(config)
    self.uniter = UniterModel(config, img_dim)
    self.vqa_output = nn.Sequential(
        nn.Linear(config.hidden_size, config.hidden_size * 2),
        GELU(),
        LayerNorm(config.hidden_size * 2, eps=1e-12),
        nn.Linear(config.hidden_size * 2, num_answer))
    self.apply(self.init_weights)
    # Additional MLM task head, constructed with the word embedding weights.
    self.cls = BertOnlyMLMHead(
        config, self.uniter.embeddings.word_embeddings.weight)
def __init__(self, input_size, output_size, dropout_prob,
             layernorm_epsilon=1.0e-12, input_is_parallel=False,
             init_method=init.xavier_normal_):
    super(BertParallelTransformerOutput, self).__init__()
    # Components.
    self.dense = RowParallelLinear(input_size, output_size,
                                   input_is_parallel=input_is_parallel,
                                   init_method=init_method)
    self.dropout = torch.nn.Dropout(dropout_prob)
    self.layernorm = LayerNorm(output_size, eps=layernorm_epsilon)
def __init__(self, hidden_size, num_attention_heads,
             attention_dropout_prob, output_dropout_prob,
             layernorm_epsilon, init_method,
             output_layer_init_method=None, relative_encoding=False,
             performer=False, attention_scale=1.0):
    super(ParallelTransformerLayer, self).__init__()
    # Set output layer initialization if not provided.
    if output_layer_init_method is None:
        output_layer_init_method = init_method
    # Layernorm on the input data.
    self.input_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)
    # Self attention.
    self.attention = ParallelSelfAttention(
        hidden_size, num_attention_heads, attention_dropout_prob,
        output_dropout_prob, init_method,
        output_layer_init_method=output_layer_init_method,
        relative_encoding=relative_encoding, performer=performer,
        attention_scale=attention_scale)
    # Layernorm after the self attention.
    self.post_attention_layernorm = LayerNorm(hidden_size,
                                              eps=layernorm_epsilon)
    # MLP.
    self.mlp = ParallelMLP(
        hidden_size, output_dropout_prob, init_method,
        output_layer_init_method=output_layer_init_method)
def __init__(self, in_dim, ctx_scope, ksize, nlayer=3):
    super().__init__()
    self.net = nn.ModuleList()
    for i in range(nlayer):
        # The last convolution projects to ctx_scope channels.
        self.net.append(
            nn.Conv1d(in_dim, ctx_scope if i == nlayer - 1 else in_dim,
                      ksize))
    self.fnn1 = nn.Linear(ctx_scope, 2 * ctx_scope)
    self.fnn2 = nn.Linear(2 * ctx_scope, ctx_scope)
    self.lnv = LayerNorm(in_dim)
    self.ksize = ksize
    self.nlayer = nlayer
def __init__(self, embed_dim, hidden_dim, heads=1, dropout=None,
             rnn=False, residual=True, use_attn=True):
    super().__init__()
    self.attn = None
    if use_attn:
        self.attn = Attention(embed_dim, heads=heads, r=False,
                              dropout=dropout)
    self.ff = Boom(embed_dim, hidden_dim, dropout=dropout, shortcut=True)
    self.lnstart = LayerNorm(embed_dim, eps=1e-12)
    self.lnmid = LayerNorm(embed_dim, eps=1e-12)
    self.lnmem = LayerNorm(embed_dim, eps=1e-12)
    self.lnout = LayerNorm(embed_dim, eps=1e-12)
    self.lnff = LayerNorm(embed_dim, eps=1e-12)
    self.lnxff = LayerNorm(embed_dim, eps=1e-12)
    # Guard against the default dropout=None, which nn.Dropout rejects.
    self.drop = nn.Dropout(dropout if dropout is not None else 0.0)
    self.gelu = GELU()
    self.residual = residual
    self.rnn = None
    if rnn:
        self.rnn = nn.LSTM(input_size=embed_dim, hidden_size=embed_dim,
                           batch_first=False)
    # rnn may also be a prebuilt recurrent module rather than a flag.
    if rnn not in [True, False]:
        self.rnn = rnn
def __init__(self, config, img_dim, num_answer):
    super().__init__(config)
    self.uniter = VillaModel(config, img_dim)
    self.vqa_output = nn.Sequential(
        nn.Linear(config.hidden_size, config.hidden_size * 2),
        GELU(),
        LayerNorm(config.hidden_size * 2, eps=1e-12),
        nn.Linear(config.hidden_size * 2, num_answer))
    self.apply(self.init_weights)
    self.img_noise = None
    self.txt_noise = None
    self.kl = nn.KLDivLoss(reduction='mean')
    self.adv_grad_scale = 0.1
def __init__(self, config):
    super(BertEmbeddings, self).__init__()
    self.word_embeddings = nn.Embedding(config.vocab_size,
                                        config.hidden_size)
    self.position_embeddings = nn.Embedding(config.max_position_embeddings,
                                            config.hidden_size)
    self.token_type_embeddings = nn.Embedding(config.type_vocab_size,
                                              config.hidden_size)
    # self.LayerNorm is not snake-cased to stick with the TensorFlow model
    # variable name and be able to load any TensorFlow checkpoint file.
    self.LayerNorm = LayerNorm(config.hidden_size, eps=1e-12)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)