def __init__(self, local_rank, vocab, embed_dim, ff_embed_dim, num_heads, dropout, layers, smoothing_factor, approx):
    super(BIGLM, self).__init__()
    self.vocab = vocab
    self.embed_dim = embed_dim
    self.tok_embed = Embedding(self.vocab.size, embed_dim, self.vocab.padding_idx)
    self.pos_embed = LearnedPositionalEmbedding(embed_dim, device=local_rank)
    self.layers = nn.ModuleList()
    for i in range(layers):
        self.layers.append(TransformerLayer(embed_dim, ff_embed_dim, num_heads, dropout, with_external=True))
    self.emb_layer_norm = LayerNorm(embed_dim)
    self.one_more = nn.Linear(embed_dim, embed_dim)
    self.one_more_layer_norm = LayerNorm(embed_dim)
    self.out_proj = nn.Linear(embed_dim, self.vocab.size)
    self.attn_mask = SelfAttentionMask(device=local_rank)
    self.smoothing = LabelSmoothing(local_rank, self.vocab.size, self.vocab.padding_idx, smoothing_factor)
    self.dropout = dropout
    self.device = local_rank
    if approx == "none":
        self.approx = None
    elif approx == "adaptive":
        self.approx = nn.AdaptiveLogSoftmaxWithLoss(self.embed_dim, self.vocab.size, [10000, 20000, 200000])
    else:
        raise NotImplementedError("%s has not been implemented" % approx)
    self.reset_parameters()
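# nn.AdaptiveLogSoftmaxWithLoss (the "adaptive" branch above) factors the output
# softmax into frequency-based clusters defined by the cutoffs. A minimal,
# self-contained sketch of its call signature; the sizes below are illustrative
# only and are not taken from this listing:
import torch
import torch.nn as nn

embed_dim, vocab_size = 64, 250000
asm = nn.AdaptiveLogSoftmaxWithLoss(embed_dim, vocab_size, cutoffs=[10000, 20000, 200000])
hidden = torch.randn(8, embed_dim)            # flattened (batch * seq, embed_dim) hidden states
targets = torch.randint(0, vocab_size, (8,))  # gold token ids
result = asm(hidden, targets)                 # ASMoutput(output=per-token log-probs, loss=mean NLL)
print(result.output.shape, result.loss.item())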
def __init__(self, local_rank, vocab, embed_dim, ff_embed_dim, num_heads, dropout, layers, approx):
    super(BERTLM, self).__init__()
    self.vocab = vocab
    self.embed_dim = embed_dim
    self.tok_embed = Embedding(self.vocab.size, embed_dim, self.vocab.padding_idx)
    self.pos_embed = LearnedPositionalEmbedding(embed_dim, device=local_rank)
    self.seg_embed = Embedding(2, embed_dim, None)
    self.out_proj_bias = nn.Parameter(torch.Tensor(self.vocab.size))
    self.layers = nn.ModuleList()
    for i in range(layers):
        self.layers.append(TransformerLayer(embed_dim, ff_embed_dim, num_heads, dropout))
    self.emb_layer_norm = LayerNorm(embed_dim)
    self.one_more = nn.Linear(embed_dim, embed_dim)
    self.one_more_layer_norm = LayerNorm(embed_dim)
    self.one_more_nxt_snt = nn.Linear(embed_dim, embed_dim)
    self.nxt_snt_pred = nn.Linear(embed_dim, 1)
    self.dropout = dropout
    self.device = local_rank
    if approx == "none":
        self.approx = None
    elif approx == "adaptive":
        self.approx = nn.AdaptiveLogSoftmaxWithLoss(self.embed_dim, self.vocab.size, [10000, 20000, 200000])
    else:
        raise NotImplementedError("%s has not been implemented" % approx)
    self.reset_parameters()
def __init__(self, dim, heads, max_len):
    super().__init__()
    self.attention = Attention(dim, heads, max_len)
    self.norm1 = LayerNorm(dim)
    self.ff = feedforward(dim, heads, max_len)
    self.norm2 = LayerNorm(dim)
    self.drop = nn.Dropout(0.1)
def __init__(self, embed_dim, ff_embed_dim, num_heads, dropout, with_external=False, weights_dropout=True):
    super(TransformerLayer, self).__init__()
    self.self_attn = MultiheadAttention(embed_dim, num_heads, dropout, weights_dropout)
    self.fc1 = nn.Linear(embed_dim, ff_embed_dim)
    self.fc2 = nn.Linear(ff_embed_dim, embed_dim)
    self.attn_layer_norm = LayerNorm(embed_dim)
    self.ff_layer_norm = LayerNorm(embed_dim)
    self.with_external = with_external
    self.dropout = dropout
    if self.with_external:
        self.external_attn = MultiheadAttention(embed_dim, num_heads, dropout, weights_dropout)
        self.external_layer_norm = LayerNorm(embed_dim)
    self.reset_parameters()
def __init__(self, opt, padding_idx4item=0, padding_idx4prefer=0):
    super().__init__()
    # self.pad_idx, self.start_idx, self.end_idx)
    self.batch_size = opt['batch_size']
    self.max_length = opt['max_length']
    self.dropout = opt['dropout']
    self.num_layers = 2  # opt['num_layers']
    self.vocab_size = opt['vocab_size']
    self.user_size = opt['user_size']
    self.dim = opt['dim']
    self.embedding_size = opt['embedding_size']
    self.pad_idx4item = padding_idx4item
    self.pad_idx4prefer = padding_idx4prefer
    self.embeddings = _create_embeddings(self.vocab_size, self.embedding_size, self.pad_idx4item)
    self.user_embeddings = _create_embeddings(self.user_size, self.embedding_size, self.pad_idx4item)
    self.position_embeddings = nn.Embedding(opt['max_length'], opt['dim'])
    self.LayerNorm = LayerNorm(opt['dim'], eps=1e-12)
    self.dropout = nn.Dropout(opt['dropout'])
    opt['num_layers'] = 2
    self.SAS_encoder = Encoder(opt)
    self.prefer_SAS_encoder = Encoder(opt)
    self.neg_SAS_encoder = Encoder(opt)
    self.item_norm = nn.Linear(opt['dim'], opt['dim'])
    self.criterion = nn.BCELoss()
    self.cs_loss = nn.CrossEntropyLoss()
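# _create_embeddings() is referenced above but not defined in this listing. A
# plausible sketch, assuming a ParlAI-style helper: an nn.Embedding with a
# padding index and scaled-normal initialisation. This is an illustration, not
# the original implementation.
import torch.nn as nn

def _create_embeddings(vocab_size, embedding_size, padding_idx):
    e = nn.Embedding(vocab_size, embedding_size, padding_idx=padding_idx)
    nn.init.normal_(e.weight, mean=0, std=embedding_size ** -0.5)
    nn.init.constant_(e.weight[padding_idx], 0)
    return e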
def __init__(self, dim, vocab_size, max_len, n_segs):
    super().__init__()
    self.embed = nn.Embedding(vocab_size, dim)
    self.embedpos = nn.Embedding(max_len, dim)
    self.segembed = nn.Embedding(n_segs, dim)
    self.norm = LayerNorm(dim)
    self.drop = nn.Dropout(0.1)
def __init__(self, model, hyper_config):
    super(standard_layernorm, self).__init__()
    if torch.cuda.is_available():
        self.dtype = torch.cuda.FloatTensor
    else:
        self.dtype = torch.FloatTensor
    self.z_size = model.z_size
    self.x_size = model.x_size
    self.act_func = model.act_func

    # Encoder
    self.encoder_weights = []
    self.layer_norms = []
    for i in range(len(hyper_config['encoder_arch'])):
        self.encoder_weights.append(nn.Linear(hyper_config['encoder_arch'][i][0], hyper_config['encoder_arch'][i][1]))
        if i != len(hyper_config['encoder_arch']) - 1:
            self.layer_norms.append(LayerNorm(hyper_config['encoder_arch'][i][1]))

    count = 1
    for i in range(len(self.encoder_weights)):
        self.add_module(str(count), self.encoder_weights[i])
        count += 1
        if i != len(hyper_config['encoder_arch']) - 1:
            self.add_module(str(count), self.layer_norms[i])
            count += 1
def __init__(self, local_rank, vocab, embed_dim, ff_embed_dim, num_heads, dropout, layers, smoothing_factor, approx=None):
    super(BIGLM, self).__init__()
    self.vocab = vocab
    self.embed_dim = embed_dim
    self.tok_embed = Embedding(self.vocab.size, embed_dim, self.vocab.padding_idx)
    self.pos_embed = LearnedPositionalEmbedding(embed_dim, device=local_rank)
    self.layers = nn.ModuleList()
    for i in range(layers):
        self.layers.append(TransformerLayer(embed_dim, ff_embed_dim, num_heads, dropout, with_external=True))
    self.emb_layer_norm = LayerNorm(embed_dim)
    self.one_more = nn.Linear(embed_dim, embed_dim)
    self.one_more_layer_norm = LayerNorm(embed_dim)
    self.out_proj = nn.Linear(embed_dim, self.vocab.size)
    self.attn_mask = SelfAttentionMask(device=local_rank)
    self.smoothing = LabelSmoothing(local_rank, self.vocab.size, self.vocab.padding_idx, smoothing_factor)
    self.dropout = dropout
    self.device = local_rank
    self.approx = approx
    self.reset_parameters()
def __init__(self, dim, heads, max_len, n_seg):
    super().__init__()
    self.allenc = AllEncode(dim, heads, max_len, n_seg)
    self.fc1 = nn.Linear(dim, dim)
    self.tanh = nn.Tanh()
    self.fc2 = nn.Linear(dim, 2)
    self.norm = LayerNorm(dim)
    embed_weight = self.allenc.embed.embed.weight
    n_vocab, n_dim = embed_weight.size()
    self.decoder = nn.Linear(n_dim, n_vocab, bias=False)
    self.decoder.weight = embed_weight
    self.linear = nn.Linear(dim, dim)
def __init__(self, hyper_config, seed=1):
    super(VAE, self).__init__()
    torch.manual_seed(seed)
    self.z_size = hyper_config['z_size']
    self.x_size = hyper_config['x_size']
    self.act_func = hyper_config['act_func']
    self.q_dist = hyper_config['q_dist'](self, hyper_config=hyper_config)
    # for aaa in self.q_dist.parameters():
    #     print (aaa.size())
    if torch.cuda.is_available():
        self.dtype = torch.cuda.FloatTensor
        self.q_dist.cuda()
    else:
        self.dtype = torch.FloatTensor

    # Decoder
    self.decoder_weights = []
    self.layer_norms = []
    for i in range(len(hyper_config['decoder_arch'])):
        self.decoder_weights.append(nn.Linear(hyper_config['decoder_arch'][i][0], hyper_config['decoder_arch'][i][1]))
        if i != len(hyper_config['decoder_arch']) - 1:
            self.layer_norms.append(LayerNorm(hyper_config['decoder_arch'][i][1]))

    count = 1
    for i in range(len(self.decoder_weights)):
        self.add_module(str(count), self.decoder_weights[i])
        count += 1
        if i != len(hyper_config['decoder_arch']) - 1:
            self.add_module(str(count), self.layer_norms[i])
            count += 1
def __init__(self, local_rank, input_dim=768, ff_dim=2048, num_heads=8, dropout=0.2, layers=6):
    super(PrefixPredict, self).__init__()
    self.input_dim = input_dim
    self.layers = nn.ModuleList()
    for i in range(layers):
        self.layers.append(TransformerLayer(input_dim, ff_dim, num_heads, dropout))
    self.one_more = nn.Linear(input_dim, input_dim)
    self.one_more_layer_norm = LayerNorm(input_dim)
    self.attn_mask = SelfAttentionMask(device=local_rank)
    self.loss_fun = ContrativeLoss(device=local_rank)
    self.dropout = dropout
    self.device = local_rank
    self.reset_parameters()
def __init__(self, layer, N):
    super(Decoder, self).__init__()
    self.layers = clones(layer, N)
    self.norm = LayerNorm(layer.size)
def __init__(self, layer, n=1):
    super(Encoder, self).__init__()
    self.layers = clones(layer, n)
    self.norm = LayerNorm(layer.size)
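# Both the Decoder and Encoder above rely on a clones() helper that is not shown
# in this listing. A typical definition (an assumption, following the Annotated
# Transformer convention) deep-copies the given layer into an nn.ModuleList:
import copy
import torch.nn as nn

def clones(module, N):
    # produce N independent copies of the same layer
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])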
def __init__(self, size: int, dropout=0.1):
    super(SublayerConnection, self).__init__()
    self.norm = LayerNorm(size)
    self.dropout = nn.Dropout(p=dropout)
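# The forward pass that usually pairs with SublayerConnection is not part of this
# listing; a minimal sketch, assuming the pre-norm residual scheme of the
# Annotated Transformer (normalise, apply the wrapped sublayer, dropout, then add
# the residual). Hypothetical companion method, not taken from the original source:
def forward(self, x, sublayer):
    return x + self.dropout(sublayer(self.norm(x)))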