def __init__(self, nx, n_ctx, config, scale=False):
    super().__init__()
    self.output_attentions = config.output_attentions

    n_state = nx  # in Attention: n_state=768 (nx=n_embd)
    # [switch nx => n_state from Block to Attention to keep identical to TF implem]
    assert n_state % config.n_head == 0
    self.register_buffer(
        "bias",
        torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))
    self.n_head = config.n_head
    self.split_size = n_state
    self.scale = scale

    self.c_attn = Conv1D(n_state * 3, nx)
    self.c_proj = Conv1D(n_state, nx)
    self.attn_dropout = nn.Dropout(config.attn_pdrop)
    self.resid_dropout = nn.Dropout(config.resid_pdrop)
    self.pruned_heads = set()
def __init__(self, nx, n_ctx, config, scale=False):
    super().__init__()
    self.output_attentions = config.output_attentions

    n_state = nx  # in Attention: n_state=768 (nx=n_embd)
    assert n_state % config.n_head == 0  # 768/12=64, remainder 0
    self.register_buffer(
        "bias",
        torch.tril(torch.ones(n_ctx, n_ctx)).view(
            1, 1, n_ctx, n_ctx))  # build an n_ctx x n_ctx tensor of ones, keep its lower triangle (tril), then reshape
    self.n_head = config.n_head  # number of heads
    self.split_size = n_state  # split size is n_state
    self.scale = scale  # scale=False
    self.c_attn = Conv1D(
        n_state * 3, nx)  # learnable attention matrix 1: size 768 * 3 (one each for q, k, v) by 768
    self.c_proj = Conv1D(n_state, nx)  # learnable attention matrix 2: size 768 by 768
    self.attn_dropout = nn.Dropout(config.attn_pdrop)  # dropout rate for the attention weights
    self.resid_dropout = nn.Dropout(config.resid_pdrop)  # dropout rate for the residual connection
    self.pruned_heads = set()  # pruned heads is a set
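# Hedged usage sketch (not part of the original source): instantiating the
# attention module above with GPT-2-small sizes. Assumes the __init__ lives in
# a `class Attention(nn.Module)` and that `Conv1D` is HuggingFace transformers'
# Conv1D; both names are assumptions carried over from the reference code.
import torch
from transformers import GPT2Config

config = GPT2Config()  # defaults: n_embd=768, n_head=12, n_positions=1024
attn = Attention(nx=config.n_embd, n_ctx=1024, config=config, scale=True)
# The causal mask buffer has shape (1, 1, n_ctx, n_ctx); query position j may
# attend to key position i only where bias[..., j, i] == 1, i.e. i <= j.
print(attn.bias.shape)  # torch.Size([1, 1, 1024, 1024])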
def __init__(self, nx, n_ctx, config, scale=False, is_cross_attention=False):
    super().__init__()

    n_state = nx  # in Attention: n_state=768 (nx=n_embd)
    # [switch nx => n_state from Block to Attention to keep identical to TF implem]
    assert n_state % config.n_head == 0
    if config.static_attention_mask is not None:
        self.register_buffer(
            "bias",
            (torch.tril(torch.ones((n_ctx, n_ctx), dtype=torch.uint8)).view(
                1, 1, n_ctx, n_ctx) * config.static_attention_mask))
    else:
        self.register_buffer(
            "bias",
            torch.tril(torch.ones((n_ctx, n_ctx), dtype=torch.uint8)).view(
                1, 1, n_ctx, n_ctx))
    self.register_buffer("masked_bias", torch.tensor(-1e4))
    self.n_head = config.n_head
    self.split_size = n_state
    self.scale = scale
    self.is_cross_attention = is_cross_attention
    if self.is_cross_attention:
        self.c_attn = Conv1D(2 * n_state, nx)
        self.q_attn = Conv1D(n_state, nx)
    else:
        self.c_attn = Conv1D(3 * n_state, nx)
    self.c_proj = Conv1D(n_state, nx)
    self.attn_dropout = nn.Dropout(config.attn_pdrop)
    self.resid_dropout = nn.Dropout(config.resid_pdrop)
    self.pruned_heads = set()
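# Hedged sketch (an assumption, not the source's test code): comparing the
# projection shapes of the two branches. Conv1D(nf, nx) stores its weight as
# (nx, nf), so self-attention fuses q, k, v into one 768 -> 2304 projection,
# while cross-attention projects queries from the decoder stream (768 -> 768)
# and keys/values from the encoder states (768 -> 1536) separately.
import torch
from transformers import GPT2Config

config = GPT2Config()
config.static_attention_mask = None  # attribute assumed by this variant
self_attn = Attention(config.n_embd, 1024, config)
cross_attn = Attention(config.n_embd, 1024, config, is_cross_attention=True)
print(self_attn.c_attn.weight.shape)   # torch.Size([768, 2304]) fused q, k, v
print(cross_attn.c_attn.weight.shape)  # torch.Size([768, 1536]) fused k, v
print(cross_attn.q_attn.weight.shape)  # torch.Size([768, 768]) queries only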
def __init__(self, config):
    super(GPT2Model, self).__init__(config)
    self.output_hidden_states = config.output_hidden_states
    self.output_attentions = config.output_attentions
    self.output_past = config.output_past

    self.wte = nn.Embedding(config.vocab_size, config.n_embd)
    self.wpe = nn.Embedding(config.n_positions, config.n_embd)
    self.drop = nn.Dropout(config.embd_pdrop)
    # manually modify number of layers in encoder to accommodate GPU memory
    n = 6  # config.n_layer
    self.h = nn.ModuleList(
        [Unmasked_Block(config.n_ctx, config, scale=True) for _ in range(n)])
    self.ln_f = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)

    self.init_weights()

    # added code here
    self.averageSelfAttention = AverageSelfAttention(config.n_embd)
    nx = config.n_embd
    nz = config.n_embd
    self.mean = Conv1D(nz, nx)
    self.logvar = Conv1D(nz, nx)
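# Hedged sketch (an assumption about how self.mean / self.logvar are used
# downstream): the pooled output of averageSelfAttention is projected to a
# Gaussian posterior, and a latent z is drawn with the reparameterization
# trick, the standard construction for a VAE encoder.
import torch

def reparameterize(mean, logvar):
    # z = mu + sigma * eps with eps ~ N(0, I); keeps sampling differentiable
    std = torch.exp(0.5 * logvar)
    eps = torch.randn_like(std)
    return mean + eps * std

pooled = torch.randn(2, 768)             # stand-in for averageSelfAttention output
mean_head = torch.nn.Linear(768, 768)    # stand-in for self.mean (Conv1D)
logvar_head = torch.nn.Linear(768, 768)  # stand-in for self.logvar (Conv1D)
z = reparameterize(mean_head(pooled), logvar_head(pooled))  # (batch, nz)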
def __init__(self, config, add_input=False, add_attn=False,
             add_softmax=False, attn_proj_vary=False, learn_prior=False):
    super(GPT2LMHeadModel, self).__init__(config)

    # add code here
    self.add_input = add_input
    self.add_attn = add_attn
    self.add_softmax = add_softmax
    self.attn_proj_vary = attn_proj_vary
    self.learn_prior = learn_prior

    self.transformer = Decoder(config, add_input, add_attn, attn_proj_vary)
    self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
    self.encoder = Encoder(config)
    if self.learn_prior:
        self.encoder_prior = Encoder(config)

    if self.add_softmax:
        nz = config.n_embd
        self.lm_head_rep = Conv1D(config.vocab_size, nz)
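# Hedged sketch (an assumption, not the source's verbatim forward pass): with
# add_softmax=True, the logits plausibly combine the usual LM head over the
# decoder hidden states with lm_head_rep applied to the latent code, broadcast
# across the sequence. All tensor names here are illustrative stand-ins.
import torch

hidden = torch.randn(2, 10, 768)  # (batch, seq, n_embd) decoder output
z = torch.randn(2, 768)           # (batch, nz) latent from the encoder
lm_head = torch.nn.Linear(768, 50257, bias=False)       # self.lm_head
lm_head_rep = torch.nn.Linear(768, 50257, bias=False)   # stand-in for Conv1D
logits = lm_head(hidden) + lm_head_rep(z).unsqueeze(1)  # (batch, seq, vocab)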
def __init__(self, n_state, config):  # in MLP: n_state=3072 (4 * n_embd)
    super().__init__()
    nx = config.n_embd
    self.c_fc = Conv1D(n_state, nx)
    self.c_proj = Conv1D(nx, n_state)
    self.dropout = nn.Dropout(config.resid_pdrop)
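# Hedged sketch of the matching forward pass, following the standard
# HuggingFace GPT-2 MLP: project up to 4*n_embd, apply GELU, project back
# down, then dropout. The reference implementation uses a tanh-approximated
# GELU; F.gelu is a close stand-in here.
import torch.nn.functional as F

def forward(self, x):
    h = F.gelu(self.c_fc(x))  # (batch, seq, n_state=3072)
    h2 = self.c_proj(h)       # (batch, seq, n_embd=768)
    return self.dropout(h2)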