Example #1
    def __init__(self, nx, n_ctx, config, scale=False):
        super().__init__()
        self.output_attentions = config.output_attentions

        n_state = nx  # in Attention: n_state=768 (nx=n_embd)
        # [switch nx => n_state from Block to Attention to keep identical to TF implem]
        assert n_state % config.n_head == 0
        self.register_buffer(
            "bias",
            torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))
        self.n_head = config.n_head
        self.split_size = n_state
        self.scale = scale

        self.c_attn = Conv1D(n_state * 3, nx)
        self.c_proj = Conv1D(n_state, nx)
        self.attn_dropout = nn.Dropout(config.attn_pdrop)
        self.resid_dropout = nn.Dropout(config.resid_pdrop)
        self.pruned_heads = set()
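
The "bias" buffer registered above is the causal mask. Below is a minimal standalone sketch of how such a buffer is typically applied to the attention scores; the shapes, the score computation, and the -1e4 fill value are illustrative assumptions, not this example's actual forward pass.

import torch

n_ctx, n_head, head_dim = 8, 2, 4
bias = torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx)

q = torch.randn(1, n_head, n_ctx, head_dim)
k = torch.randn(1, n_head, n_ctx, head_dim)

scores = q @ k.transpose(-1, -2) / (head_dim ** 0.5)  # (1, n_head, n_ctx, n_ctx)
mask = bias[:, :, :n_ctx, :n_ctx]                     # slice to the current sequence length
scores = scores * mask - 1e4 * (1 - mask)             # push future positions to a large negative value
weights = torch.softmax(scores, dim=-1)               # each row attends only to past positions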
Example #2
    def __init__(self, nx, n_ctx, config, scale=False):
        super().__init__()
        self.output_attentions = config.output_attentions

        n_state = nx  # in Attention: n_state=768 (nx=n_embd)
        assert n_state % config.n_head == 0  # 768/12=64, remainder 0
        self.register_buffer(
            "bias",
            torch.tril(torch.ones(n_ctx, n_ctx)).view(
                1, 1, n_ctx,
                n_ctx))  # ones (n_ctx, n_ctx) -> lower triangle (tril) -> view as (1, 1, n_ctx, n_ctx)
        self.n_head = config.n_head  # number of attention heads
        self.split_size = n_state  # split size is n_state
        self.scale = scale  # scale=False

        self.c_attn = Conv1D(
            n_state * 3, nx)  # learnable attention matrix 1: size 768 * 3 (one each for q, k, v) by 768
        self.c_proj = Conv1D(n_state, nx)  # learnable attention matrix 2: size 768 by 768
        self.attn_dropout = nn.Dropout(config.attn_pdrop)  # dropout rate on the attention weights
        self.resid_dropout = nn.Dropout(config.resid_pdrop)  # dropout rate on the residual connection
        self.pruned_heads = set()  # pruned heads tracked as a set
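
The 3 * n_state output of c_attn described in the comments above is a single projection whose result is split into query, key, and value and then reshaped per head. A small sketch of that split, assuming nn.Linear as a stand-in for Conv1D (which acts as a linear projection here); sizes are illustrative.

import torch
import torch.nn as nn

n_embd, n_head, seq_len = 768, 12, 10
c_attn = nn.Linear(n_embd, 3 * n_embd)  # stand-in for Conv1D(n_state * 3, nx)

x = torch.randn(1, seq_len, n_embd)
q, k, v = c_attn(x).split(n_embd, dim=2)  # each (1, seq_len, 768)

def split_heads(t):
    # (batch, seq, n_embd) -> (batch, n_head, seq, head_dim)
    b, s, e = t.shape
    return t.view(b, s, n_head, e // n_head).permute(0, 2, 1, 3)

q, k, v = (split_heads(t) for t in (q, k, v))  # each (1, 12, seq_len, 64)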
Example #3
    def __init__(self,
                 nx,
                 n_ctx,
                 config,
                 scale=False,
                 is_cross_attention=False):
        super().__init__()

        n_state = nx  # in Attention: n_state=768 (nx=n_embd)
        # [switch nx => n_state from Block to Attention to keep identical to TF implem]
        assert n_state % config.n_head == 0

        if config.static_attention_mask is not None:
            self.register_buffer(
                "bias",
                (torch.tril(torch.ones(
                    (n_ctx, n_ctx), dtype=torch.uint8)).view(
                        1, 1, n_ctx, n_ctx) * config.static_attention_mask))
        else:
            self.register_buffer(
                "bias",
                torch.tril(torch.ones(
                    (n_ctx, n_ctx),
                    dtype=torch.uint8)).view(1, 1, n_ctx, n_ctx))

        self.register_buffer("masked_bias", torch.tensor(-1e4))
        self.n_head = config.n_head
        self.split_size = n_state
        self.scale = scale
        self.is_cross_attention = is_cross_attention
        if self.is_cross_attention:
            self.c_attn = Conv1D(2 * n_state, nx)
            self.q_attn = Conv1D(n_state, nx)
        else:
            self.c_attn = Conv1D(3 * n_state, nx)
        self.c_proj = Conv1D(n_state, nx)
        self.attn_dropout = nn.Dropout(config.attn_pdrop)
        self.resid_dropout = nn.Dropout(config.resid_pdrop)
        self.pruned_heads = set()
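
When is_cross_attention is set, this example builds a separate q_attn for queries and a 2 * n_state c_attn for keys and values. A hedged sketch of how those projections are presumably wired: queries come from the decoder states, keys and values from the encoder states. nn.Linear stands in for Conv1D and the shapes are illustrative.

import torch
import torch.nn as nn

n_embd = 768
q_attn = nn.Linear(n_embd, n_embd)      # stand-in for Conv1D(n_state, nx)
c_attn = nn.Linear(n_embd, 2 * n_embd)  # stand-in for Conv1D(2 * n_state, nx)

decoder_hidden = torch.randn(1, 5, n_embd)  # current (decoder) sequence
encoder_hidden = torch.randn(1, 9, n_embd)  # encoder outputs being attended to

query = q_attn(decoder_hidden)                            # (1, 5, 768)
key, value = c_attn(encoder_hidden).split(n_embd, dim=2)  # each (1, 9, 768)
# the causal "bias" buffer is not needed on this path: cross-attention may look
# at every encoder position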
Example #4
    def __init__(self, config):
        super(GPT2Model, self).__init__(config)
        self.output_hidden_states = config.output_hidden_states
        self.output_attentions = config.output_attentions
        self.output_past = config.output_past

        self.wte = nn.Embedding(config.vocab_size, config.n_embd)
        self.wpe = nn.Embedding(config.n_positions, config.n_embd)
        self.drop = nn.Dropout(config.embd_pdrop)

        # manually modify number of layers in encoder to accommodate GPU memory
        n = 6  # config.n_layer
        self.h = nn.ModuleList([Unmasked_Block(config.n_ctx, config, scale=True) for _ in range(n)])

        self.ln_f = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)

        self.init_weights()

        # added code here
        self.averageSelfAttention = AverageSelfAttention(config.n_embd)
        nx = config.n_embd
        nz = config.n_embd
        self.mean = Conv1D(nz, nx)
        self.logvar = Conv1D(nz, nx)
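
The AverageSelfAttention pooling together with the mean and logvar heads added at the end suggests a Gaussian latent over a pooled hidden state. A hedged sketch of the reparameterization step such heads usually feed into; the mean-pooling stand-in and the sampling code are assumptions for illustration, not this repository's forward pass.

import torch
import torch.nn as nn

n_embd = 768
mean_head = nn.Linear(n_embd, n_embd)    # stand-in for Conv1D(nz, nx)
logvar_head = nn.Linear(n_embd, n_embd)  # stand-in for Conv1D(nz, nx)

hidden = torch.randn(1, 12, n_embd)  # transformer outputs
pooled = hidden.mean(dim=1)          # crude stand-in for AverageSelfAttention

mu = mean_head(pooled)
logvar = logvar_head(pooled)
z = mu + torch.exp(0.5 * logvar) * torch.randn_like(mu)  # reparameterization trick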
Example #5
    def __init__(self, config, add_input=False, add_attn=False, add_softmax=False, attn_proj_vary=False, learn_prior=False):
        super(GPT2LMHeadModel, self).__init__(config)

        # add code here
        self.add_input = add_input
        self.add_attn = add_attn
        self.add_softmax = add_softmax
        self.attn_proj_vary = attn_proj_vary
        self.learn_prior = learn_prior

        self.transformer = Decoder(config, add_input, add_attn, attn_proj_vary)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        self.encoder = Encoder(config)
        if self.learn_prior:
            self.encoder_prior = Encoder(config)

        if self.add_softmax:
            nz = config.n_embd
            self.lm_head_rep = Conv1D(config.vocab_size, nz)
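
One plausible reading of add_softmax and lm_head_rep is that the latent code contributes an extra term to the vocabulary logits produced by lm_head. This is an assumption about the surrounding forward pass, shown with toy sizes and nn.Linear standing in for Conv1D.

import torch
import torch.nn as nn

n_embd, vocab_size, seq_len = 768, 1000, 6  # toy vocabulary size
lm_head = nn.Linear(n_embd, vocab_size, bias=False)
lm_head_rep = nn.Linear(n_embd, vocab_size)  # stand-in for Conv1D(config.vocab_size, nz)

hidden = torch.randn(1, seq_len, n_embd)  # decoder outputs
z = torch.randn(1, n_embd)                # latent code from the encoder

logits = lm_head(hidden) + lm_head_rep(z).unsqueeze(1)  # broadcast the latent term over positions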
Example #6
    def __init__(self, n_state, config):  # in MLP: n_state=3072 (4 * n_embd)
        super().__init__()
        nx = config.n_embd
        self.c_fc = Conv1D(n_state, nx)
        self.c_proj = Conv1D(nx, n_state)
        self.dropout = nn.Dropout(config.resid_pdrop)
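
This __init__ only sets up the layers; the forward pass of the standard GPT-2 MLP expands the hidden state to 4 * n_embd, applies an activation (GELU is assumed here, since the snippet does not show it), projects back, and applies dropout. A runnable sketch with nn.Linear standing in for Conv1D:

import torch
import torch.nn as nn
import torch.nn.functional as F

n_embd, n_state = 768, 3072
c_fc = nn.Linear(n_embd, n_state)    # stand-in for Conv1D(n_state, nx)
c_proj = nn.Linear(n_state, n_embd)  # stand-in for Conv1D(nx, n_state)
dropout = nn.Dropout(0.1)

x = torch.randn(1, 4, n_embd)
h = F.gelu(c_fc(x))        # (1, 4, 3072)
out = dropout(c_proj(h))   # (1, 4, 768)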