Example #1
    def __init__(self, n_ctx, config, scale=False):
        super(Block, self).__init__()
        nx = config.n_embd
        # Layer norm and self-attention for the first sub-layer.
        self.ln_1 = LayerNorm(nx, eps=config.layer_norm_epsilon)
        self.attn = Attention(nx, n_ctx, config, scale)
        # Layer norm and feed-forward MLP (4x expansion) for the second sub-layer.
        self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon)
        self.mlp = MLP(4 * nx, config)
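The fragment above shows only the constructor. For context, here is a minimal self-contained sketch of the pre-norm residual wiring such a block typically implements; the class below is a hypothetical stand-in (torch.nn.MultiheadAttention and an nn.Sequential feed-forward replace the custom Attention and MLP modules), not the source's implementation.

import torch
from torch import nn

class PreNormBlock(nn.Module):
    """Illustrative stand-in for the Block above: LayerNorm -> sub-layer -> residual add."""

    def __init__(self, n_embd, n_head, eps=1e-5):
        super().__init__()
        self.ln_1 = nn.LayerNorm(n_embd, eps=eps)
        self.attn = nn.MultiheadAttention(n_embd, n_head, batch_first=True)
        self.ln_2 = nn.LayerNorm(n_embd, eps=eps)
        self.mlp = nn.Sequential(nn.Linear(n_embd, 4 * n_embd), nn.GELU(),
                                 nn.Linear(4 * n_embd, n_embd))

    def forward(self, x):
        # Normalize before attention, then add the result back to the residual stream.
        h = self.ln_1(x)
        a, _ = self.attn(h, h, h, need_weights=False)
        x = x + a
        # Same pre-norm pattern for the feed-forward sub-layer.
        return x + self.mlp(self.ln_2(x))

# Example usage: y = PreNormBlock(768, 12)(torch.randn(2, 16, 768))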
Example #2
    def __init__(self,
                 hidden_size,
                 num_attention_heads,
                 attention_dropout_prob,
                 output_dropout_prob,
                 layernorm_epsilon,
                 init_method,
                 output_layer_init_method=None):
        super(GPT2ParallelTransformerLayer, self).__init__()
        # Set output layer initialization if not provided.
        if output_layer_init_method is None:
            output_layer_init_method = init_method

        # Layernorm on the input data.
        self.input_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)

        # Self attention.
        self.attention = GPT2ParallelSelfAttention(
            hidden_size,
            num_attention_heads,
            attention_dropout_prob,
            output_dropout_prob,
            init_method,
            output_layer_init_method=output_layer_init_method)

        # Layernorm after the self attention.
        self.post_attention_layernorm = LayerNorm(hidden_size,
                                                  eps=layernorm_epsilon)

        # MLP
        self.mlp = GPT2ParallelMLP(
            hidden_size,
            output_dropout_prob,
            init_method,
            output_layer_init_method=output_layer_init_method)
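Example #3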
    def __init__(
        self,
        n_heads,
        embedding_size,
        ffn_size,
        attention_dropout=0.0,
        relu_dropout=0.0,
        dropout=0.0,
        activation='relu',
        variant='aiayn',
    ):
        super().__init__()
        self.dim = embedding_size
        self.ffn_dim = ffn_size
        self.variant = variant
        self.activation = activation
        self.dropout = nn.Dropout(p=dropout)

        self.self_attention = MultiHeadAttention(n_heads,
                                                 embedding_size,
                                                 dropout=attention_dropout)
        self.norm1 = LayerNorm(embedding_size, eps=LAYER_NORM_EPS)

        self.encoder_attention = MultiHeadAttention(n_heads,
                                                    embedding_size,
                                                    dropout=attention_dropout)
        self.norm2 = LayerNorm(embedding_size, eps=LAYER_NORM_EPS)

        self.ffn = TransformerFFN(embedding_size,
                                  ffn_size,
                                  relu_dropout=relu_dropout,
                                  activation=activation)
        self.norm3 = LayerNorm(embedding_size, eps=LAYER_NORM_EPS)
Example #4
    def __init__(self, attention_mask_func, mlp_activation_func, init_method,
                 output_layer_init_method, layer_number):
        args = get_args()

        super(ParallelTransformerLayer, self).__init__()
        self.layer_number = layer_number

        self.apply_residual_connection_post_layernorm \
            = args.apply_residual_connection_post_layernorm

        # Layernorm on the input data.
        self.input_layernorm = LayerNorm(args.hidden_size,
                                         eps=args.layernorm_epsilon)

        # Self attention.
        self.attention = ParallelSelfAttention(attention_mask_func,
                                               init_method,
                                               output_layer_init_method,
                                               layer_number)

        # Layernorm after the self attention.
        self.post_attention_layernorm = LayerNorm(args.hidden_size,
                                                  eps=args.layernorm_epsilon)

        # MLP
        self.mlp = ParallelMLP(mlp_activation_func, init_method,
                               output_layer_init_method)
Example #5
    def __init__(self, blocks, hidden_dim):
        super().__init__()
        # Query/key/value projections.
        self.wq = modules.Linear(hidden_dim, hidden_dim)
        self.wk = modules.Linear(hidden_dim, hidden_dim)
        self.wv = modules.Linear(hidden_dim, hidden_dim)
        # A separate layer norm for each projection.
        self.lnq = LayerNorm(hidden_dim)
        self.lnk = LayerNorm(hidden_dim)
        self.lnv = LayerNorm(hidden_dim)
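The constructor above attaches a separate LayerNorm to each of the query/key/value projections but does not show how they are applied. Below is a minimal sketch of one plausible forward pass; the wiring is an assumption for illustration only, with plain torch.nn layers standing in for modules.Linear and the custom LayerNorm.

import torch
from torch import nn

class NormalizedQKV(nn.Module):
    """Hypothetical illustration: project the input to q/k/v, then layer-normalize each."""

    def __init__(self, hidden_dim):
        super().__init__()
        self.wq = nn.Linear(hidden_dim, hidden_dim)
        self.wk = nn.Linear(hidden_dim, hidden_dim)
        self.wv = nn.Linear(hidden_dim, hidden_dim)
        self.lnq = nn.LayerNorm(hidden_dim)
        self.lnk = nn.LayerNorm(hidden_dim)
        self.lnv = nn.LayerNorm(hidden_dim)

    def forward(self, x):
        # Each projection is normalized independently before it is fed to attention scoring.
        return self.lnq(self.wq(x)), self.lnk(self.wk(x)), self.lnv(self.wv(x))

# Example usage: q, k, v = NormalizedQKV(512)(torch.randn(4, 32, 512))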
Example #6
    def __init__(self, embed_dim, hidden_dim, max_mem, nhead=1, dropp=0.1,
                 residual=True, rnn=True, use_attn=True):
        super().__init__()
        self.attn = None
        if use_attn:
            self.attn = modules.EMultiHeadAttention(embed_dim, hidden_dim,
                                                    nhead=nhead, dropp=dropp,
                                                    residual=residual)
        if rnn:
            self.rnn = LSTM(embed_dim, embed_dim, batch_first=False)
        self.ff = modules.Boom(embed_dim, hidden_dim, dropp=dropp,
                               shortcut=True, residual=residual)

        self.max_mem = max_mem
        self.lnmid = LayerNorm(embed_dim, eps=1e-12)
        self.lnmem = LayerNorm(embed_dim, eps=1e-12)
        self.lnout = LayerNorm(embed_dim, eps=1e-12)
        self.lnff = LayerNorm(embed_dim, eps=1e-12)
        self.gelu = modules.GELU()
Example #7
    def __init__(self,
                 hidden_size,
                 num_attention_heads,
                 attention_dropout_prob,
                 output_dropout_prob,
                 layernorm_epsilon,
                 init_method,
                 output_layer_init_method=None,
                 num_experts=1):
        super(GPT2ParallelTransformerLayer, self).__init__()
        # Set output layer initialization if not provided.
        if output_layer_init_method is None:
            output_layer_init_method = init_method

        # Layernorm on the input data.
        self.input_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)

        # Self attention.
        self.attention = GPT2ParallelSelfAttention(
            hidden_size,
            num_attention_heads,
            attention_dropout_prob,
            output_dropout_prob,
            init_method,
            output_layer_init_method=output_layer_init_method)

        # Layernorm after the self attention.
        self.post_attention_layernorm = LayerNorm(hidden_size,
                                                  eps=layernorm_epsilon)

        # MLP
        if num_experts == 1:
            self.mlp = GPT2ParallelMLP(
                hidden_size,
                output_dropout_prob,
                init_method,
                output_layer_init_method=output_layer_init_method)
        else:
            from deepspeed.moe.layer import MoE
            # Use the DeepSpeed API to use MoE layer and experts.
            # -- sharding, comm. and parameter handling will be done inside DeepSpeed
            self.mlp = MoE(
                hidden_size,
                output_dropout_prob,
                GPT2ParallelMLP(
                    hidden_size,
                    output_dropout_prob,
                    init_method,
                    output_layer_init_method=output_layer_init_method),
                num_experts=num_experts)
Example #8
    def __init__(self,
                 config,
                 img_dim,
                 loss="cls",
                 margin=0.2,
                 hard_ratio=0.3,
                 mlp=1):
        super().__init__(config)
        self.uniter = UniterModel(config, img_dim)
        if mlp == 1:
            self.re_output = nn.Linear(config.hidden_size, 1)
        elif mlp == 2:
            self.re_output = nn.Sequential(
                nn.Linear(config.hidden_size, config.hidden_size), GELU(),
                LayerNorm(config.hidden_size, eps=1e-12),
                nn.Linear(config.hidden_size, 1))
        else:
            raise ValueError("MLP restricted to be 1 or 2 layers.")
        self.loss = loss
        assert self.loss in ['cls', 'rank']
        if self.loss == 'rank':
            self.margin = margin
            self.hard_ratio = hard_ratio
        else:
            self.crit = nn.CrossEntropyLoss(reduction='none')

        self.apply(self.init_weights)
Example #9
    def __init__(self, in_dim, out_dim, cache_len):
        super().__init__()
        self.cache_len = cache_len
        # self.lnq = nn.BatchNorm1d(in_dim, in_dim)
        # self.lnk = nn.BatchNorm1d(in_dim, in_dim)
        # self.lnv = nn.BatchNorm1d(in_dim, in_dim)

        self.lnq = LayerNorm(in_dim)
        self.lnk = LayerNorm(in_dim)
        self.lnv = LayerNorm(in_dim)

        self.wq = nn.Linear(in_dim, out_dim)
        self.wk = nn.Linear(in_dim, out_dim)
        self.wv = nn.Linear(in_dim, out_dim)

        self.hyper_net = indicator(in_dim, cache_len, 5)
Example #10
    def __init__(self,
                 nhid,
                 q=True,
                 k=False,
                 v=False,
                 r=False,
                 heads=1,
                 dropout=None):
        super().__init__()
        self.qs = nn.Parameter(
            torch.zeros(size=(1, 1, nhid), dtype=torch.float))
        self.ks = nn.Parameter(
            torch.zeros(size=(1, 1, nhid), dtype=torch.float))
        self.vs = nn.Parameter(
            torch.zeros(size=(1, 1, nhid), dtype=torch.float))
        self.qkvs = nn.Parameter(
            torch.zeros(size=(1, 3, nhid), dtype=torch.float))
        self.heads = heads
        self.nhid = nhid
        assert nhid % self.heads == 0, "Heads must divide vector evenly"
        self.drop = nn.Dropout(dropout) if dropout else None
        self.gelu = GELU()
        self.q = nn.Linear(nhid, nhid) if q else None
        self.qln = LayerNorm(nhid, eps=1e-12)
        self.k = nn.Linear(nhid, nhid) if k else None
        self.v = nn.Linear(nhid, nhid) if v else None
        self.r = nn.Linear(2 * nhid, nhid) if r else None
        self.r_gate = nn.Parameter(
            torch.ones(size=(1, 1, nhid), dtype=torch.float))
        self.vq = None
        self.vq = Overparam(nhid)
        # from fastai.text.models import QRNNLayer
        # self.vq = QRNNLayer(input_size=nhid, hidden_size=nhid, save_prev_x=False, zoneout=0, window=1, output_gate=False, batch_first=False)
        self.vq_collapsed = False
Example #11
    def __init__(self, config, eps=1e-12):
        super().__init__()
        # Position-wise feed-forward sub-layer: intermediate and output
        # projections, plus dropout and layer norm.
        self.intermediate = nn.Linear(config.hidden_size,
                                      config.intermediate_size)
        self.output = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.layer_norm = LayerNorm(config.hidden_size, eps=eps)
Example #12
    def __init__(self, model_config, template):
        super().__init__()
        self.model_name = 'mplstm'
        self.embed_dim = model_config.embed_dim
        self.hidden_dim = model_config.hidden_dim
        self.nlayer = model_config.nlayer
        print('Model [{}] has been built.'.format(self.model_name))

        self.cache_len = 200
        self.dropi = nn.Dropout(model_config.dropp)

        self.atts = nn.ModuleList()
        self.fnns = nn.ModuleList()
        self.rnns = nn.ModuleList()
        self.rnln = nn.ModuleList()
        for i in range(model_config.nlayer):
            self.atts.append(
                add_transformer(self.hidden_dim, self.hidden_dim,
                                self.cache_len))
            self.fnns.append(Boom(self.hidden_dim))
            self.rnns.append(
                nn.LSTM(self.hidden_dim,
                        self.hidden_dim,
                        model_config.nlayer,
                        batch_first=True))
            self.rnln.append(LayerNorm(self.hidden_dim))
Example #13
    def __init__(self, in_dim, out_dim, cache_len):
        super().__init__()
        self.cache_len = cache_len

        self.lnq = LayerNorm(in_dim)
        self.lnk = LayerNorm(in_dim)
        self.lnv = LayerNorm(in_dim)

        self.wq = nn.Linear(in_dim, out_dim)
        self.wk = nn.Linear(in_dim, out_dim)
        self.wv = nn.Linear(in_dim, out_dim)

        self.hyper_kernel = 5
        self.hyper_layer = 1
        self.hyper_net = indicator(in_dim, cache_len, self.hyper_kernel,
                                   self.hyper_layer)
Example #14
    def __init__(self, hidden_size, feat_dim, img_linear_weight):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(hidden_size, hidden_size), GELU(),
                                 LayerNorm(hidden_size, eps=1e-12))

        self.weight = img_linear_weight
        self.bias = nn.Parameter(torch.zeros(feat_dim))
Example #15
    def __init__(self,
                 hidden_size,
                 num_attention_heads,
                 attention_dropout_prob,
                 output_dropout_prob,
                 layernorm_epsilon,
                 init_method,
                 output_layer_init_method=None):
        super(ParallelDecoderLayer, self).__init__()
        # Set output layer initialization if not provided.
        if output_layer_init_method is None:
            output_layer_init_method = init_method

        # Layernorm on the input data.
        self.input_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)

        # Self attention.
        self.self_attention = ParallelSelfAttention(
            hidden_size,
            num_attention_heads,
            attention_dropout_prob,
            output_dropout_prob,
            init_method,
            output_layer_init_method=output_layer_init_method)

        # Layernorm after the self attention.
        self.post_self_layernorm = LayerNorm(hidden_size,
                                             eps=layernorm_epsilon)

        self.cross_attention = ParallelCrossAttention(
            hidden_size,
            num_attention_heads,
            attention_dropout_prob,
            output_dropout_prob,
            init_method,
            output_layer_init_method=output_layer_init_method)

        # Layernorm after the cross attention.
        self.post_attention_layernorm = LayerNorm(hidden_size,
                                                  eps=layernorm_epsilon)

        # MLP
        self.mlp = ParallelMLP(
            hidden_size,
            output_dropout_prob,
            init_method,
            output_layer_init_method=output_layer_init_method)
Example #16
    def __init__(self, config, img_dim, num_answer):
        super().__init__(config)
        self.roberta = VLXLMRModel(config, img_dim)
        self.vqa_output = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size * 2), GELU(),
            LayerNorm(config.hidden_size * 2, eps=config.layer_norm_eps),
            nn.Linear(config.hidden_size * 2, num_answer))
        self.apply(self.init_weights)
Example #17
    def __init__(self, config, img_dim, num_answer):
        super().__init__(config)
        self.bert = UniterModel(config, img_dim)
        self.vqa_output = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size * 2), GELU(),
            LayerNorm(config.hidden_size * 2, eps=1e-12),
            nn.Linear(config.hidden_size * 2, num_answer))
        self.apply(self.init_weights)
Example #18
    def __init__(self, hidden_size, label_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            GELU(),
            LayerNorm(hidden_size, eps=1e-12),
            nn.Linear(hidden_size, label_dim),
        )
Example #19
    def __init__(self, config, img_dim):
        super().__init__(config, img_dim)
        self.uniter = UniterModel(config, img_dim)
        self.vcr_output = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size * 2), nn.ReLU(),
            LayerNorm(config.hidden_size * 2, eps=1e-12),
            nn.Linear(config.hidden_size * 2, 2))
        self.apply(self.init_weights)
Example #20
    def __init__(self,
                 num_layers,
                 hidden_size,
                 num_attention_heads,
                 attention_dropout_prob,
                 output_dropout_prob,
                 checkpoint_activations,
                 checkpoint_num_layers=1,
                 layernorm_epsilon=1.0e-5,
                 init_method_std=0.02,
                 use_scaled_init_for_output_weights=True,
                 use_deepspeed_sparse=None,
                 sparse_mode='all'):
        super(GPT3ParallelTransformer, self).__init__()

        if DEEPSPEED_WRAP:
            from deepspeed.ops.sparse_attention import SparseSelfAttention

        # Store activation checkpointing flag.
        self.checkpoint_activations = checkpoint_activations
        self.checkpoint_num_layers = checkpoint_num_layers

        output_layer_init_method = None
        if use_scaled_init_for_output_weights:
            output_layer_init_method = scaled_init_method(
                init_method_std, num_layers)
        if use_deepspeed_sparse and sparse_mode == 'alternating':
            print('Use alternating sparse & dense attention layers')

        def get_layer(layer_num, num_layers):
            sparsity_config = use_deepspeed_sparse
            if use_deepspeed_sparse:
                if sparse_mode == 'alternating' and layer_num % 2:  # every other layer (odd zero-based index) is dense
                    sparsity_config = None
                elif sparse_mode == 'top_bottom' and layer_num >= num_layers // 2:  # top levels are dense
                    sparsity_config = None
            return GPT3ParallelTransformerLayer(
                hidden_size,
                num_attention_heads,
                attention_dropout_prob,
                output_dropout_prob,
                layernorm_epsilon,
                unscaled_init_method(init_method_std),
                output_layer_init_method=output_layer_init_method,
                use_deepspeed_sparse=sparsity_config)

        # Transformer layers.
        self.layers = torch.nn.ModuleList(
            [get_layer(i, num_layers) for i in range(num_layers)])

        # Final layer norm before output.
        self.final_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)

        if DEEPSPEED_WRAP:
            if DEEPSPEED_WRAP.deepspeed.checkpointing.is_configured():
                global get_cuda_rng_tracker, checkpoint
                get_cuda_rng_tracker = DEEPSPEED_WRAP.deepspeed.checkpointing.get_cuda_rng_tracker
                checkpoint = DEEPSPEED_WRAP.deepspeed.checkpointing.checkpoint
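The interesting part of this example is how get_layer decides which layers keep a sparsity config. The standalone sketch below reproduces just that selection logic; the function name and the example config are made up for illustration.

def select_sparsity_configs(num_layers, sparsity_config, sparse_mode='all'):
    """Return one sparsity config (or None for dense attention) per layer,
    following the selection logic in get_layer above."""
    configs = []
    for layer_num in range(num_layers):
        cfg = sparsity_config
        if sparsity_config:
            if sparse_mode == 'alternating' and layer_num % 2:
                cfg = None  # every other layer falls back to dense attention
            elif sparse_mode == 'top_bottom' and layer_num >= num_layers // 2:
                cfg = None  # the upper half of the stack stays dense
        configs.append(cfg)
    return configs

# select_sparsity_configs(4, {'mode': 'fixed'}, 'alternating')
# -> [{'mode': 'fixed'}, None, {'mode': 'fixed'}, None]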
Example #21
    def __init__(self, n_hidden, n_layer, n_head=6, mm=True):
        super().__init__()
        self.n_head = n_head
        self.att = nn.ModuleList()
        for l in range(n_layer):
            en = MultiHeadBlock(n_hidden, n_head=n_head)
            ln = LayerNorm(n_hidden)
            # ln = apex.normalization.FusedLayerNorm(n_hidden)
            self.att.append(nn.Sequential(en, ln))
Example #22
    def __init__(self, cfg, position_embeds=True, segment_embeds=True):
        super().__init__()
        self.tok_embed = Embedding(cfg.vocab_size, cfg.dim)  # token embedding
        self.pos_embed = Embedding(
            cfg.max_len,
            cfg.dim) if position_embeds else None  # position embedding
        self.seg_embed = Embedding(
            cfg.n_segments, cfg.dim
        ) if segment_embeds else None  # segment(token type) embedding
        self.norm = LayerNorm(cfg)
        self.drop = nn.Dropout(cfg.p_drop_hidden)
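Example #23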
    def __init__(self,
                 hidden_size: int,
                 hidden_act: typing.Union[str, typing.Callable] = 'gelu',
                 layer_norm_eps: float = 1e-12):
        super().__init__()
        self.dense = nn.Linear(hidden_size, hidden_size)
        if isinstance(hidden_act, str):
            self.transform_act_fn = get_activation_fn(hidden_act)
        else:
            self.transform_act_fn = hidden_act
        self.LayerNorm = LayerNorm(hidden_size, eps=layer_norm_eps)
Example #24
    def __init__(self, config, img_dim, num_answer):
        super().__init__(config)
        self.uniter = UniterModel(config, img_dim)
        self.vqa_output = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size * 2), GELU(),
            LayerNorm(config.hidden_size * 2, eps=1e-12),
            nn.Linear(config.hidden_size * 2, num_answer))
        self.apply(self.init_weights)
        # added MLM task stuff
        self.cls = BertOnlyMLMHead(
            config, self.uniter.embeddings.word_embeddings.weight)
Example #25
    def __init__(self, input_size, output_size, dropout_prob,
                 layernorm_epsilon=1.0e-12, input_is_parallel=False,
                 init_method=init.xavier_normal_):
        super(BertParallelTransformerOutput, self).__init__()
        # Components.
        self.dense = RowParallelLinear(input_size,
                                       output_size,
                                       input_is_parallel=input_is_parallel,
                                       init_method=init_method)
        self.dropout = torch.nn.Dropout(dropout_prob)
        self.layernorm = LayerNorm(output_size, eps=layernorm_epsilon)
Example #26
    def __init__(self,
                 hidden_size,
                 num_attention_heads,
                 attention_dropout_prob,
                 output_dropout_prob,
                 layernorm_epsilon,
                 init_method,
                 output_layer_init_method=None,
                 relative_encoding=False,
                 performer=False,
                 attention_scale=1.0):
        super(ParallelTransformerLayer, self).__init__()
        # Set output layer initialization if not provided.
        if output_layer_init_method is None:
            output_layer_init_method = init_method

        # Layernorm on the input data.
        self.input_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)

        # Self attention.
        self.attention = ParallelSelfAttention(
            hidden_size,
            num_attention_heads,
            attention_dropout_prob,
            output_dropout_prob,
            init_method,
            output_layer_init_method=output_layer_init_method,
            relative_encoding=relative_encoding,
            performer=performer,
            attention_scale=attention_scale)

        # Layernorm after the self attention.
        self.post_attention_layernorm = LayerNorm(hidden_size,
                                                  eps=layernorm_epsilon)

        # MLP
        self.mlp = ParallelMLP(
            hidden_size,
            output_dropout_prob,
            init_method,
            output_layer_init_method=output_layer_init_method)
Example #27
    def __init__(self, in_dim, ctx_scope, ksize, nlayer=3):
        super().__init__()
        self.net = nn.ModuleList()
        for i in range(nlayer):
            self.net.append(
                nn.Conv1d(in_dim, ctx_scope if i == nlayer - 1 else in_dim,
                          ksize))
        self.fnn1 = nn.Linear(ctx_scope, 2 * ctx_scope)
        self.fnn2 = nn.Linear(2 * ctx_scope, ctx_scope)
        self.lnv = LayerNorm(in_dim)
        self.ksize = ksize
        self.nlayer = nlayer
Example #28
    def __init__(self,
                 embed_dim,
                 hidden_dim,
                 heads=1,
                 dropout=None,
                 rnn=False,
                 residual=True,
                 use_attn=True):
        super().__init__()
        # self.attn = PyTorchAttention(embed_dim, heads=heads, dropout=dropout)
        self.attn = None
        if use_attn:
            self.attn = Attention(embed_dim,
                                  heads=heads,
                                  r=False,
                                  dropout=dropout)
        self.ff = Boom(embed_dim, hidden_dim, dropout=dropout, shortcut=True)
        self.lnstart = LayerNorm(embed_dim, eps=1e-12)
        self.lnmid = LayerNorm(embed_dim, eps=1e-12)
        self.lnmem = LayerNorm(embed_dim, eps=1e-12)
        self.lnout = LayerNorm(embed_dim, eps=1e-12)
        self.lnff = LayerNorm(embed_dim, eps=1e-12)
        self.lnxff = LayerNorm(embed_dim, eps=1e-12)
        self.drop = nn.Dropout(dropout)
        self.gelu = GELU()
        self.residual = residual

        self.rnn = None
        if rnn:
            self.rnn = nn.LSTM(input_size=embed_dim,
                               hidden_size=embed_dim,
                               batch_first=False)
            if rnn not in [True, False]:
                self.rnn = rnn
Example #29
    def __init__(self, config, img_dim, num_answer):
        super().__init__(config)
        self.uniter = VillaModel(config, img_dim)
        self.vqa_output = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size * 2), GELU(),
            LayerNorm(config.hidden_size * 2, eps=1e-12),
            nn.Linear(config.hidden_size * 2, num_answer))
        self.apply(self.init_weights)

        self.img_noise = None
        self.txt_noise = None
        self.kl = nn.KLDivLoss(reduction='mean')
        self.adv_grad_scale = 0.1
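Example #30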
    def __init__(self, config):
        super(BertEmbeddings, self).__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size,
                                            config.hidden_size)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings,
                                                config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size,
                                                  config.hidden_size)

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = LayerNorm(config.hidden_size, eps=1e-12)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)